Re: [RFC PATCH bpf-next v3 2/2] net: Add additional bit to support userspace timestamp type

From: Abhishek Chauhan (ABC)
Date: Tue Apr 16 2024 - 19:41:27 EST




On 4/15/2024 1:00 PM, Martin KaFai Lau wrote:
> On 4/13/24 12:07 PM, Willem de Bruijn wrote:
>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index a83a2120b57f..b6346c21c3d4 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -827,7 +827,8 @@ enum skb_tstamp_type {
>>>    *    @tstamp_type: When set, skb->tstamp has the
>>>    *        delivery_time in mono clock base (i.e. EDT).  Otherwise, the
>>>    *        skb->tstamp has the (rcv) timestamp at ingress and
>>> - *        delivery_time at egress.
>>> + *        delivery_time at egress or skb->tstamp defined by skb->sk->sk_clockid
>>> + *        coming from userspace
>>>    *    @napi_id: id of the NAPI struct this skb came from
>>>    *    @sender_cpu: (aka @napi_id) source CPU in XPS
>>>    *    @alloc_cpu: CPU which did the skb allocation.
>>> @@ -955,7 +956,7 @@ struct sk_buff {
>>>       /* private: */
>>>       __u8            __mono_tc_offset[0];
>>>       /* public: */
>>> -    __u8            tstamp_type:1;    /* See SKB_MONO_DELIVERY_TIME_MASK */
>>> +    __u8            tstamp_type:2;    /* See SKB_MONO_DELIVERY_TIME_MASK */
>>>   #ifdef CONFIG_NET_XGRESS
>>>       __u8            tc_at_ingress:1;    /* See TC_AT_INGRESS_MASK */
>>>       __u8            tc_skip_classify:1;
>>
>> A quick pahole for a fairly standard .config that I had laying around
>> shows a hole after this list of bits, so no huge concerns there from
>> adding a bit:
>>
>>             __u8               slow_gro:1;           /*     3: 4  1 */
>>             __u8               csum_not_inet:1;      /*     3: 5  1 */
>>
>>             /* XXX 2 bits hole, try to pack */
>>
>>             __u16              tc_index;             /*     4     2 */
>>
>>> @@ -1090,10 +1091,10 @@ struct sk_buff {
>>>    */
>>>   #ifdef __BIG_ENDIAN_BITFIELD
>>>   #define SKB_MONO_DELIVERY_TIME_MASK    (1 << 7)
>>> -#define TC_AT_INGRESS_MASK        (1 << 6)
>>> +#define TC_AT_INGRESS_MASK        (1 << 5)
>>
>> Have to be careful when adding a new 2 bit tstamp_type with both bits
>> set, that this does not incorrectly get interpreted as MONO.
>>
>> I haven't looked closely at the BPF API, but hopefully it can be
>> extensible to return the specific type. If it is hardcoded to return
>> either MONO or not, then only 0x1 should match, not 0x3.
>
> Good point. I believe it is the best to have bpf to consider both bits in tstamp_type:2 in filter.c to avoid the 0x3 surprise in the future. The BPF API can be extended to support SKB_CLOCK_TAI.
>
> Regardless, in bpf_convert_tstamp_write(), it still needs to clear both bits in tstamp_type when it is at ingress. Right now it only clears the mono bit.
>
> Then it may as well consider both tstamp_type:2 bits in bpf_convert_tstamp_read() and bpf_convert_tstamp_type_read(). e.g. bpf_convert_tstamp_type_read(), it should be a pretty straight forward change because the SKB_CLOCK_* enum value should be a 1:1 mapping to the BPF_SKB_TSTAMP_*.
>
>>
>>>   #else
>>>   #define SKB_MONO_DELIVERY_TIME_MASK    (1 << 0)
>>> -#define TC_AT_INGRESS_MASK        (1 << 1)
>>> +#define TC_AT_INGRESS_MASK        (1 << 2)
>>>   #endif
>>>   #define SKB_BF_MONO_TC_OFFSET        offsetof(struct sk_buff, __mono_tc_offset)
>>>  
>
Hi Martin and Willem,

I have made the changes as per your guidelines . I will be raising RFC patch bpf-next v4 soon. Giving you heads up of the changes i am bringing in the BPF code. If you feel i have done something incorrectly, please do correct me here.
I apologies for adding the code here and making the content of the email huge for other upstream reviewers.

//Introduce a new BPF tstamp type mask in bpf.h

#ifdef __BIG_ENDIAN_BITFIELD
#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7)
+ #define SKB_TAI_DELIVERY_TIME_MASK (1 << 6) (new)
#define TC_AT_INGRESS_MASK (1 << 5)
#else
#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0)
+ #define SKB_TAI_DELIVERY_TIME_MASK (1 << 1) (new)
#define TC_AT_INGRESS_MASK (1 << 2)
#endif

//changes in the filter.c (bpf_convert_tstamp_{read,write}, bpf_convert_tstamp_type_read())code are accordingly taken care since now we have 3 bits instead of 2

Value 3 => unspec
Value 2 => tai
Value 1 => mono

static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
struct bpf_insn *insn)
{
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;
/* AX is needed because src_reg and dst_reg could be the same */
__u8 tmp_reg = BPF_REG_AX;

*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
SKB_BF_MONO_TC_OFFSET);
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check for both bits are set (if so make it tstamp_unspec)
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
SKB_MONO_DELIVERY_TIME_MASK, 3); <== if mono is set then its mono base and jump to 3rd instruction from here.
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
SKB_TAI_DELIVERY_TIME_MASK, 4); <== if tai is set then its tai base and jump to 4th instruction from here.
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
*insn++ = BPF_JMP_A(1);
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
*insn++ = BPF_JMP_A(1);
*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_TAI);

return insn;
}

//Values 7, 6 and 5 as input will set the value reg to 0
//otherwise the skb_reg has correct configuration and will store the content in value reg.

static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
const struct bpf_insn *si,
struct bpf_insn *insn)
{
__u8 value_reg = si->dst_reg;
__u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
/* If the tstamp_type is read,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, read skb->tstamp as is if tstamp_type_access is true.
*/
if (!prog->tstamp_type_access) {
/* AX is needed because src_reg and dst_reg could be the same */
__u8 tmp_reg = BPF_REG_AX;

*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
/*check if all three bits are set*/
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK |
SKB_TAI_DELIVERY_TIME_MASK); <== check if all 3 bits are set which is value 7
/*if all 3 bits are set jump 3 instructions and clear the register */
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK |
SKB_TAI_DELIVERY_TIME_MASK, 4); <== if all 3 bits are set then value reg is set to 0
/*Now check Mono is set with ingress mask if so clear*/
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 3); <== check if value is 5 (mono + ingress) if so then value reg is set to 0
/*Now Check tai is set with ingress mask if so clear*/
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
TC_AT_INGRESS_MASK | SKB_TAI_DELIVERY_TIME_MASK, 2); <== check if value is 6 (tai + ingress) if so then value reg is set to 0
/*Now Check tai and mono are set if so clear */
*insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg,
SKB_MONO_DELIVERY_TIME_MASK | SKB_TAI_DELIVERY_TIME_MASK, 1); <== check if value 3 (tai + mono) if so then value reg is set to 0
/* goto <store> */
*insn++ = BPF_JMP_A(2);
/* skb->tc_at_ingress && skb->tstamp_type:1,
* read 0 as the (rcv) timestamp.
*/
*insn++ = BPF_MOV64_IMM(value_reg, 0);
*insn++ = BPF_JMP_A(1);
}
#endif

*insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
offsetof(struct sk_buff, tstamp));
return insn;
}

//this was pretty straight forward
//if ingress mask is set just go ahead and unset both tai and mono delivery time.

static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
const struct bpf_insn *si,
struct bpf_insn *insn)
{
__u8 value_reg = si->src_reg;
__u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
/* If the tstamp_type is read,
* the bpf prog is aware the tstamp could have delivery time.
* Thus, write skb->tstamp as is if tstamp_type_access is true.
* Otherwise, writing at ingress will have to clear the
* mono_delivery_time (skb->tstamp_type:1)bit also.
*/
if (!prog->tstamp_type_access) {
__u8 tmp_reg = BPF_REG_AX;

*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
/* Writing __sk_buff->tstamp as ingress, goto <clear> */
*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
/* goto <store> */
*insn++ = BPF_JMP_A(3);
/* <clear>: mono_delivery_time or (skb->tstamp_type:1) */
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
/* <clear>: tai delivery_time or (skb->tstamp_type:2) */ (new)
*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TAI_DELIVERY_TIME_MASK); (new) <== reset tai delivery mask if ingress bit is set.
*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
}
#endif

/* <store>: skb->tstamp = tstamp */
*insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
return insn;


And the ctx_rewrite will be as follows

.read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
"w11 &= 7;"
"if w11 == 0x7 goto pc+4;"
"if w11 == 0x5 goto pc+3;"
"if w11 == 0x6 goto pc+2;"
"if w11 == 0x3 goto pc+1;"
"goto pc+2"
"$dst = 0;"
"goto pc+1;"
"$dst = *(u64 *)($ctx + sk_buff::tstamp);",
.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
"if w11 & 0x4 goto pc+1;"
"goto pc+3;"
"w11 &= -2;"
"w11 &= -3;"
"*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
"*(u64 *)($ctx + sk_buff::tstamp) = $src;",