Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

From: Xuesen Huang
Date: Wed Mar 03 2021 - 22:24:45 EST




> 2021年3月4日 上午2:53,Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx> 写道:
>
> On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@xxxxxxxxx> wrote:
>>
>> From: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
>>
>> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
>> encapsulation. But that is not appropriate when pushing Ethernet header.
>>
>> Add an option to further specify encap L2 type and set the inner_protocol
>> as ETH_P_TEB.
>>
>> Update test_tc_tunnel to verify adding vxlan encapsulation works with
>> this flag.
>>
>> Suggested-by: Willem de Bruijn <willemb@xxxxxxxxxx>
>> Signed-off-by: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
>> Signed-off-by: Zhiyong Cheng <chengzhiyong@xxxxxxxxxxxx>
>> Signed-off-by: Li Wang <wangli09@xxxxxxxxxxxx>
>
> Thanks for adding the test. Perhaps that is better in a separate patch?
>
> Overall looks great to me.
>
> The patch has not (yet?) arrived on patchwork.
>
Thanks Willem, I will separate it into two patch.

I will send patch/v5 with only that new flag addition, lol.

>> enum {
>> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> index 37bce7a..6e144db 100644
>> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> @@ -20,6 +20,14 @@
>> #include <bpf/bpf_endian.h>
>> #include <bpf/bpf_helpers.h>
>>
>> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
>> +
>> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
>> +
>
> Instead of untyped macros, I'd define encap_ipv4 as a function that
> calls __encap_ipv4.
>
> And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.
>
I defined these macros to try to keep the existing invocation for encap_ipv4/6
as the same, if we define this as a function all invocation should be modified?

>> static const int cfg_port = 8000;
>>
>> static const int cfg_udp_src = 20000;
>> @@ -27,11 +35,24 @@
>> #define UDP_PORT 5555
>> #define MPLS_OVER_UDP_PORT 6635
>> #define ETH_OVER_UDP_PORT 7777
>> +#define VXLAN_UDP_PORT 8472
>> +
>> +#define EXTPROTO_VXLAN 0x1
>> +
>> +#define VXLAN_N_VID (1u << 24)
>> +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
>> +#define VXLAN_FLAGS 0x8
>> +#define VXLAN_VNI 1
>>
>> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
>> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
>> MPLS_LS_S_MASK | 0xff);
>>
>> +struct vxlanhdr {
>> + __be32 vx_flags;
>> + __be32 vx_vni;
>> +} __attribute__((packed));
>> +
>> struct gre_hdr {
>> __be16 flags;
>> __be16 protocol;
>> @@ -45,13 +66,13 @@ struct gre_hdr {
>> struct v4hdr {
>> struct iphdr ip;
>> union l4hdr l4hdr;
>> - __u8 pad[16]; /* enough space for L2 header */
>> + __u8 pad[24]; /* space for L2 header / vxlan header ... */
>
> could we use something like sizeof(..) instead of a constant?
>
Thanks, I will try to fix this.

>> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
>> }
>>
>> /* add L2 encap (if specified) */
>> + l2_hdr = (__u8 *)&h_outer + olen;
>> switch (l2_proto) {
>> case ETH_P_MPLS_UC:
>> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> + *(__u32 *)l2_hdr = mpls_label;
>> break;
>> case ETH_P_TEB:
>> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> - ETH_HLEN))
>
> This is non-standard indentation? Here and elsewhere.
I thinks it’s a previous issue.

>
>> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> break;
>> case ETH_P_TEB:
>> l2_len = ETH_HLEN;
>> - udp_dst = ETH_OVER_UDP_PORT;
>> + if (ext_proto & EXTPROTO_VXLAN) {
>> + udp_dst = VXLAN_UDP_PORT;
>> + l2_len += sizeof(struct vxlanhdr);
>> + } else
>> + udp_dst = ETH_OVER_UDP_PORT;
>> break;
>> }
>> flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
>> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
>> h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
>> tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
>> - sizeof(h_outer.l4hdr.udp);
>> + sizeof(h_outer.l4hdr.udp) + l2_len;
>
> Was this a bug previously?
>
Yes, a tiny bug.

>> h_outer.l4hdr.udp.check = 0;
>> h_outer.l4hdr.udp.len = bpf_htons(tot_len);
>> break;
>> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> }
>>
>> /* add L2 encap (if specified) */
>> + l2_hdr = (__u8 *)&h_outer + olen;
>> switch (l2_proto) {
>> case ETH_P_MPLS_UC:
>> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> + *(__u32 *)l2_hdr = mpls_label;
>> break;
>> case ETH_P_TEB:
>> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> - ETH_HLEN))
>> + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
>
> This is a change also for the existing case. Correctly so, I imagine.
> But the test used to pass with the wrong protocol?
Yes all tests pass. I’m not sure should we add this flag for the existing tests
which encap eth as the l2 header or only for the Vxlan test?

Waiting for your suggestion.
Thanks.