Bug #326

Fwd: [PATCH net-next] sch_sfq: rehash queues in perturb timer

Added by David Taht over 1 year ago. Updated about 1 year ago.

Status:Closed Start date:
Priority:Normal Due date:
Assignee:Dave Täht % Done:

0%

Category:Linux Kernel Spent time: 10.00 hours
Target version:1st Public Cerowrt release

Description

Eric, I think I love you.

I'd like to slam this into cerowrt, which is based on 3.1.5 for at
least another month. Are there dependencies on the other work you've
been doing in this area I should worry about?

---------- Forwarded message ----------
From: Eric Dumazet <>
Date: Wed, Dec 21, 2011 at 2:30 PM
Subject: [PATCH net-next] sch_sfq: rehash queues in perturb timer
To: David Miller <>
Cc: netdev <>

A known Out Of Order (OOO) problem hurts SFQ when timer changes
perturbation value, since all new packets delivered to SFQ enqueue might
end on different slots than previous in-flight packets.

With round robin delivery, we can thus deliver packets in a different
order.

Since SFQ is limited to small amount of in-flight packets, we can rehash
packets so that this OOO problem is fixed.

This rehashing is performed only if internal flow classifier is in use.

We now store in skb->cb[] the "struct flow_keys" so that we dont call
skb_flow_dissect() again while rehashing.

Signed-off-by: Eric Dumazet <>
---
 net/sched/sch_sfq.c |   87 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 6 deletions(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 30cda70..d329a8a 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@ -136,16 +136,30 @ static inline struct sfq_head
*sfq_dep_head(struct sfq_sched_data *q, sfq_index
       return &q->dep[val - SFQ_SLOTS];
 }

/*
* In order to be able to quickly rehash our queue when timer changes
+ * q->perturbation, we store flow_keys in skb->cb[]
+ */
struct sfq_skb_cb {
      struct flow_keys        keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
+       BUILD_BUG_ON(sizeof(skb->cb) <
+               sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb));
+       return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

 static unsigned int sfq_hash(const struct sfq_sched_data *q,
                            const struct sk_buff *skb)
 {
-       struct flow_keys keys;
+       const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
       unsigned int hash;

-       skb_flow_dissect(skb, &keys);
-       hash = jhash_3words((_force u32)keys.dst,
-                           (
_force u32)keys.src ^ keys.ip_proto,
-                           (_force u32)keys.ports, q->perturbation);
+       hash = jhash_3words((
_force u32)keys->dst,
+                           (_force u32)keys->src ^ keys->ip_proto,
+                           (
_force u32)keys->ports, q->perturbation);
       return hash & (q->divisor - 1);
 }

@ -161,8 +175,10 @ static unsigned int sfq_classify(struct sk_buff
*skb, struct Qdisc *sch,
           TC_H_MIN(skb->priority) <= q->divisor)
               return TC_H_MIN(skb->priority);

-       if (!q->filter_list)
+       if (!q->filter_list) {
+               skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
               return sfq_hash(q, skb) + 1;
+       }

       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
       result = tc_classify(skb, q->filter_list, &res);
@ -423,12 +439,71 @ sfq_reset(struct Qdisc *sch)
               kfree_skb(skb);
 }

/*
* When q->perturbation is changed, we rehash all queued skbs
+ * to avoid OOO (Out Of Order) effects.
+ * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
+ * counters.
+ /
static void sfq_rehash(struct sfq_sched_data *q)
{
+       struct sk_buff *skb;
+       int i;
+       struct sfq_slot *slot;
+       struct sk_buff_head list;

      __skb_queue_head_init(&list);

      for (i = 0; i < SFQ_SLOTS; i++) {
+               slot = &q->slots[i];
+               if (!slot->qlen)
+                       continue;
+               while (slot->qlen) {
+                       skb = slot_dequeue_head(slot);
+                       sfq_dec(q, i);
+                       __skb_queue_tail(&list, skb);
+               }
+               q->ht[slot->hash] = SFQ_EMPTY_SLOT;
+       }
+       q->tail = NULL;

      while ((skb = __skb_dequeue(&list)) != NULL) {
+               unsigned int hash = sfq_hash(q, skb);
+               sfq_index x = q->ht[hash];

              slot = &q->slots[x];
+               if (x == SFQ_EMPTY_SLOT) {
+                       x = q->dep0.next; /
get a free slot /
+                       q->ht[hash] = x;
+                       slot = &q->slots[x];
+                       slot->hash = hash;
+               }
+               slot_queue_add(slot, skb);
+               sfq_inc(q, x);
+               if (slot->qlen 1) {          /
The flow is new /
+                       if (q->tail NULL) {  /
It is the first flow */
+                               slot->next = x;
+                       } else {
+                               slot->next = q->tail->next;
+                               q->tail->next = x;
+                       }
+                       q->tail = slot;
+                       slot->allot = q->scaled_quantum;
+               }
+       }
}

 static void sfq_perturbation(unsigned long arg)
 {
       struct Qdisc *sch = (struct Qdisc *)arg;
       struct sfq_sched_data *q = qdisc_priv(sch);
+       spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

+       spin_lock(root_lock);
       q->perturbation = net_random();
+       if (!q->filter_list && q->tail)
+               sfq_rehash(q);
+       spin_unlock(root_lock);

       if (q->perturb_period)
               mod_timer(&q->perturb_timer, jiffies + q->perturb_period);

History

Updated by David Taht over 1 year ago

On Thu, Dec 22, 2011 at 5:19 AM, Eric Dumazet <> wrote:

Le jeudi 22 décembre 2011 à 05:02 +0100, Dave Taht a écrit :

Eric, I think I love you.

I'd like to slam this into cerowrt, which is based on 3.1.5 for at least another month. Are there dependencies on the other work you've been doing in this area I should worry about?

Hmm, this work depends on skb_flow_dissect() patches, included in net-next only for the moment...

(sorry for cc'ing the bug system on this one)

OK. I'm really looking forward to the upcoming next generation kernel,
but it's very hard to keep an embedded system this close to something
so bleeding edge. :/

That said, it looked plausible to to extract the flow_dissect and
adaptive-red work you've done recently, safely. I'll think upon it.

(I thought the backporting bql would be easy too, but thus far no luck)

I've moved most of my own AQM prototyping to x86, which is now
showing useful results, thx to you, tom, and bql.

Updated by Dave Täht over 1 year ago

  • Category set to Linux Kernel
  • Status changed from New to Closed
  • Assignee set to Dave Täht
  • Target version set to 14

I abstracted out BQL, this patch, etc and backported to 3.2.2.

I note that the permute option only works in the background with the
default filters (I think), and I think that I need to use the pre-nat
ips in order to get saner SFQ out the other end... but we'll see.

Certainly the old method would put in a nasty 10 second disturbance. Perhaps we'll
see different behavior now, but traces are needed to see what happens.

Updated by Dave Täht about 1 year ago

  • Target version changed from 14 to 1st Public Cerowrt release

Also available in: Atom PDF