sch_dualpi2.c 33 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177
  1. // SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
  2. /* Copyright (C) 2024 Nokia
  3. *
  4. * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
  5. * Author: Olga Albisser <olga@albisser.org>
  6. * Author: Henrik Steen <henrist@henrist.net>
  7. * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
  8. * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
  9. *
  10. * DualPI Improved with a Square (dualpi2):
  11. * - Supports congestion controls that comply with the Prague requirements
  12. * in RFC9331 (e.g. TCP-Prague)
  13. * - Supports coupled dual-queue with PI2 as defined in RFC9332
  14. * - Supports ECN L4S-identifier (IP.ECN==0b*1)
  15. *
  16. * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
  17. * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
  18. * Section 4, so they can only be used with DualPI2 in a datacenter
  19. * context.
  20. *
  21. * References:
  22. * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
  23. * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
  24. * scalable TCP." in proc. ACM CoNEXT'16, 2016.
  25. */
  26. #include <linux/errno.h>
  27. #include <linux/hrtimer.h>
  28. #include <linux/if_vlan.h>
  29. #include <linux/kernel.h>
  30. #include <linux/limits.h>
  31. #include <linux/module.h>
  32. #include <linux/skbuff.h>
  33. #include <linux/types.h>
  34. #include <net/gso.h>
  35. #include <net/inet_ecn.h>
  36. #include <net/pkt_cls.h>
  37. #include <net/pkt_sched.h>
  38. /* 32b enable to support flows with windows up to ~8.6 * 1e9 packets
  39. * i.e., twice the maximal snd_cwnd.
  40. * MAX_PROB must be consistent with the RNG in dualpi2_roll().
  41. */
  42. #define MAX_PROB U32_MAX
  43. /* alpha/beta values exchanged over netlink are in units of 256ns */
  44. #define ALPHA_BETA_SHIFT 8
  45. /* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
  46. * computations. Consequently (see and dualpi2_scale_alpha_beta()), their
  47. * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
  48. * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to
  49. * control flows whose maximal RTTs can be in usec up to few secs.
  50. */
  51. #define ALPHA_BETA_MAX ((1U << 31) - 1)
  52. /* Internal alpha/beta are in units of 64ns.
  53. * This enables to use all alpha/beta values in the allowed range without loss
  54. * of precision due to rounding when scaling them internally, e.g.,
  55. * scale_alpha_beta(1) will not round down to 0.
  56. */
  57. #define ALPHA_BETA_GRANULARITY 6
  58. #define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
  59. /* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
  60. #define MAX_WC 100
  61. struct dualpi2_sched_data {
  62. struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
  63. struct Qdisc *sch; /* The Classic queue (C-queue) */
  64. /* Registered tc filters */
  65. struct tcf_proto __rcu *tcf_filters;
  66. struct tcf_block *tcf_block;
  67. /* PI2 parameters */
  68. u64 pi2_target; /* Target delay in nanoseconds */
  69. u32 pi2_tupdate; /* Timer frequency in nanoseconds */
  70. u32 pi2_prob; /* Base PI probability */
  71. u32 pi2_alpha; /* Gain factor for the integral rate response */
  72. u32 pi2_beta; /* Gain factor for the proportional response */
  73. struct hrtimer pi2_timer; /* prob update timer */
  74. /* Step AQM (L-queue only) parameters */
  75. u32 step_thresh; /* Step threshold */
  76. bool step_in_packets; /* Step thresh in packets (1) or time (0) */
  77. /* C-queue starvation protection */
  78. s32 c_protection_credit; /* Credit (sign indicates which queue) */
  79. s32 c_protection_init; /* Reset value of the credit */
  80. u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
  81. u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
  82. /* General dualQ parameters */
  83. u32 memory_limit; /* Memory limit of both queues */
  84. u8 coupling_factor;/* Coupling factor (k) between both queues */
  85. u8 ecn_mask; /* Mask to match packets into L-queue */
  86. u32 min_qlen_step; /* Minimum queue length to apply step thresh */
  87. bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */
  88. bool drop_overload; /* Drop (1) on overload, or overflow (0) */
  89. bool split_gso; /* Split aggregated skb (1) or leave as is (0) */
  90. /* Statistics */
  91. u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
  92. u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
  93. u64 last_qdelay; /* Q delay val at the last probability update */
  94. u32 packets_in_c; /* Enqueue packet counter of the C-queue */
  95. u32 packets_in_l; /* Enqueue packet counter of the L-queue */
  96. u32 maxq; /* Maximum queue size of the C-queue */
  97. u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
  98. u32 step_marks; /* ECN mark pkt counter due to step AQM */
  99. u32 memory_used; /* Memory used of both queues */
  100. u32 max_memory_used;/* Maximum used memory */
  101. /* Deferred drop statistics */
  102. u32 deferred_drops_cnt; /* Packets dropped */
  103. u32 deferred_drops_len; /* Bytes dropped */
  104. };
  105. struct dualpi2_skb_cb {
  106. u64 ts; /* Timestamp at enqueue */
  107. u8 apply_step:1, /* Can we apply the step threshold */
  108. classified:2, /* Packet classification results */
  109. ect:2; /* Packet ECT codepoint */
  110. };
  111. enum dualpi2_classification_results {
  112. DUALPI2_C_CLASSIC = 0, /* C-queue */
  113. DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */
  114. DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */
  115. __DUALPI2_C_MAX /* Keep last*/
  116. };
  117. static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
  118. {
  119. qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
  120. return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
  121. }
  122. static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
  123. {
  124. return reference - dualpi2_skb_cb(skb)->ts;
  125. }
  126. static u64 head_enqueue_time(struct Qdisc *q)
  127. {
  128. struct sk_buff *skb = qdisc_peek_head(q);
  129. return skb ? dualpi2_skb_cb(skb)->ts : 0;
  130. }
  131. static u32 dualpi2_scale_alpha_beta(u32 param)
  132. {
  133. u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
  134. do_div(tmp, NSEC_PER_SEC);
  135. return tmp;
  136. }
  137. static u32 dualpi2_unscale_alpha_beta(u32 param)
  138. {
  139. u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
  140. do_div(tmp, MAX_PROB);
  141. return tmp;
  142. }
  143. static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
  144. {
  145. return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
  146. }
  147. static bool skb_is_l4s(struct sk_buff *skb)
  148. {
  149. return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
  150. }
  151. static bool skb_in_l_queue(struct sk_buff *skb)
  152. {
  153. return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
  154. }
  155. static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
  156. {
  157. return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
  158. }
  159. static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
  160. {
  161. if (INET_ECN_set_ce(skb)) {
  162. q->ecn_mark++;
  163. return true;
  164. }
  165. return false;
  166. }
  167. static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
  168. {
  169. q->c_protection_credit = q->c_protection_init;
  170. }
  171. /* This computes the initial credit value and WRR weight for the L queue (wl)
  172. * from the weight of the C queue (wc).
  173. * If wl > wc, the scheduler will start with the L queue when reset.
  174. */
  175. static void dualpi2_calculate_c_protection(struct Qdisc *sch,
  176. struct dualpi2_sched_data *q, u32 wc)
  177. {
  178. q->c_protection_wc = wc;
  179. q->c_protection_wl = MAX_WC - wc;
  180. q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
  181. ((int)q->c_protection_wc - (int)q->c_protection_wl);
  182. dualpi2_reset_c_protection(q);
  183. }
  184. static bool dualpi2_roll(u32 prob)
  185. {
  186. return get_random_u32() <= prob;
  187. }
  188. /* Packets in the C-queue are subject to a marking probability pC, which is the
  189. * square of the internal PI probability (i.e., have an overall lower mark/drop
  190. * probability). If the qdisc is overloaded, ignore ECT values and only drop.
  191. *
  192. * Note that this marking scheme is also applied to L4S packets during overload.
  193. * Return true if packet dropping is required in C queue
  194. */
  195. static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
  196. struct sk_buff *skb, u32 prob,
  197. bool overload)
  198. {
  199. if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
  200. if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
  201. return true;
  202. dualpi2_mark(q, skb);
  203. }
  204. return false;
  205. }
  206. /* Packets in the L-queue are subject to a marking probability pL given by the
  207. * internal PI probability scaled by the coupling factor.
  208. *
  209. * On overload (i.e., @local_l_prob is >= 100%):
  210. * - if the qdisc is configured to trade losses to preserve latency (i.e.,
  211. * @q->drop_overload), apply classic drops first before marking.
  212. * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
  213. * delay, eventually resulting in taildrop behavior once sch->limit is
  214. * reached.
  215. * Return true if packet dropping is required in L queue
  216. */
  217. static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
  218. struct sk_buff *skb,
  219. u64 local_l_prob, u32 prob,
  220. bool overload)
  221. {
  222. if (overload) {
  223. /* Apply classic drop */
  224. if (!q->drop_overload ||
  225. !(dualpi2_roll(prob) && dualpi2_roll(prob)))
  226. goto mark;
  227. return true;
  228. }
  229. /* We can safely cut the upper 32b as overload==false */
  230. if (dualpi2_roll(local_l_prob)) {
  231. /* Non-ECT packets could have classified as L4S by filters. */
  232. if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
  233. return true;
  234. mark:
  235. dualpi2_mark(q, skb);
  236. }
  237. return false;
  238. }
  239. /* Decide whether a given packet must be dropped (or marked if ECT), according
  240. * to the PI2 probability.
  241. *
  242. * Never mark/drop if we have a standing queue of less than 2 MTUs.
  243. */
  244. static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
  245. struct sk_buff *skb)
  246. {
  247. u64 local_l_prob;
  248. bool overload;
  249. u32 prob;
  250. if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
  251. return false;
  252. prob = READ_ONCE(q->pi2_prob);
  253. local_l_prob = (u64)prob * q->coupling_factor;
  254. overload = local_l_prob > MAX_PROB;
  255. switch (dualpi2_skb_cb(skb)->classified) {
  256. case DUALPI2_C_CLASSIC:
  257. return dualpi2_classic_marking(q, skb, prob, overload);
  258. case DUALPI2_C_L4S:
  259. return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
  260. overload);
  261. default: /* DUALPI2_C_LLLL */
  262. return false;
  263. }
  264. }
  265. static void dualpi2_read_ect(struct sk_buff *skb)
  266. {
  267. struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
  268. int wlen = skb_network_offset(skb);
  269. switch (skb_protocol(skb, true)) {
  270. case htons(ETH_P_IP):
  271. wlen += sizeof(struct iphdr);
  272. if (!pskb_may_pull(skb, wlen) ||
  273. skb_try_make_writable(skb, wlen))
  274. goto not_ecn;
  275. cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
  276. break;
  277. case htons(ETH_P_IPV6):
  278. wlen += sizeof(struct ipv6hdr);
  279. if (!pskb_may_pull(skb, wlen) ||
  280. skb_try_make_writable(skb, wlen))
  281. goto not_ecn;
  282. cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
  283. break;
  284. default:
  285. goto not_ecn;
  286. }
  287. return;
  288. not_ecn:
  289. /* Non pullable/writable packets can only be dropped hence are
  290. * classified as not ECT.
  291. */
  292. cb->ect = INET_ECN_NOT_ECT;
  293. }
  294. static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
  295. struct sk_buff *skb)
  296. {
  297. struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
  298. struct tcf_result res;
  299. struct tcf_proto *fl;
  300. int result;
  301. dualpi2_read_ect(skb);
  302. if (cb->ect & q->ecn_mask) {
  303. cb->classified = DUALPI2_C_L4S;
  304. return NET_XMIT_SUCCESS;
  305. }
  306. if (TC_H_MAJ(skb->priority) == q->sch->handle &&
  307. TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
  308. cb->classified = TC_H_MIN(skb->priority);
  309. return NET_XMIT_SUCCESS;
  310. }
  311. fl = rcu_dereference_bh(q->tcf_filters);
  312. if (!fl) {
  313. cb->classified = DUALPI2_C_CLASSIC;
  314. return NET_XMIT_SUCCESS;
  315. }
  316. result = tcf_classify(skb, NULL, fl, &res, false);
  317. if (result >= 0) {
  318. #ifdef CONFIG_NET_CLS_ACT
  319. switch (result) {
  320. case TC_ACT_STOLEN:
  321. case TC_ACT_QUEUED:
  322. case TC_ACT_TRAP:
  323. return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
  324. case TC_ACT_SHOT:
  325. return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
  326. }
  327. #endif
  328. cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
  329. TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
  330. }
  331. return NET_XMIT_SUCCESS;
  332. }
  333. static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
  334. struct sk_buff **to_free)
  335. {
  336. struct dualpi2_sched_data *q = qdisc_priv(sch);
  337. struct dualpi2_skb_cb *cb;
  338. if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
  339. unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
  340. qdisc_qstats_overlimit(sch);
  341. if (skb_in_l_queue(skb))
  342. qdisc_qstats_overlimit(q->l_queue);
  343. return qdisc_drop_reason(skb, sch, to_free,
  344. SKB_DROP_REASON_QDISC_OVERLIMIT);
  345. }
  346. if (q->drop_early && must_drop(sch, q, skb)) {
  347. qdisc_drop_reason(skb, sch, to_free,
  348. SKB_DROP_REASON_QDISC_CONGESTED);
  349. return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
  350. }
  351. cb = dualpi2_skb_cb(skb);
  352. cb->ts = ktime_get_ns();
  353. q->memory_used += skb->truesize;
  354. if (q->memory_used > q->max_memory_used)
  355. q->max_memory_used = q->memory_used;
  356. if (qdisc_qlen(sch) > q->maxq)
  357. q->maxq = qdisc_qlen(sch);
  358. if (skb_in_l_queue(skb)) {
  359. /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
  360. dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
  361. /* Keep the overall qdisc stats consistent */
  362. ++sch->q.qlen;
  363. qdisc_qstats_backlog_inc(sch, skb);
  364. ++q->packets_in_l;
  365. if (!q->l_head_ts)
  366. q->l_head_ts = cb->ts;
  367. return qdisc_enqueue_tail(skb, q->l_queue);
  368. }
  369. ++q->packets_in_c;
  370. if (!q->c_head_ts)
  371. q->c_head_ts = cb->ts;
  372. return qdisc_enqueue_tail(skb, sch);
  373. }
  374. /* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
  375. * each of those individually. This yields the following benefits, at the
  376. * expense of CPU usage:
  377. * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
  378. * same fate (e.g., the random mark/drop probability is applied individually)
  379. * - Improved precision of the starvation protection/WRR scheduler at dequeue,
  380. * as the size of the dequeued packets will be smaller.
  381. */
  382. static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
  383. struct sk_buff **to_free)
  384. {
  385. struct dualpi2_sched_data *q = qdisc_priv(sch);
  386. int err;
  387. err = dualpi2_skb_classify(q, skb);
  388. if (err != NET_XMIT_SUCCESS) {
  389. if (err & __NET_XMIT_BYPASS)
  390. qdisc_qstats_drop(sch);
  391. __qdisc_drop(skb, to_free);
  392. return err;
  393. }
  394. if (q->split_gso && skb_is_gso(skb)) {
  395. netdev_features_t features;
  396. struct sk_buff *nskb, *next;
  397. int cnt, byte_len, orig_len;
  398. int err;
  399. features = netif_skb_features(skb);
  400. nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
  401. if (IS_ERR_OR_NULL(nskb))
  402. return qdisc_drop(skb, sch, to_free);
  403. cnt = 1;
  404. byte_len = 0;
  405. orig_len = qdisc_pkt_len(skb);
  406. skb_list_walk_safe(nskb, nskb, next) {
  407. skb_mark_not_on_list(nskb);
  408. /* Iterate through GSO fragments of an skb:
  409. * (1) Set pkt_len from the single GSO fragments
  410. * (2) Copy classified and ect values of an skb
  411. * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
  412. */
  413. qdisc_skb_cb(nskb)->pkt_len = nskb->len;
  414. qdisc_skb_cb(nskb)->pkt_segs = 1;
  415. dualpi2_skb_cb(nskb)->classified =
  416. dualpi2_skb_cb(skb)->classified;
  417. dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
  418. err = dualpi2_enqueue_skb(nskb, sch, to_free);
  419. if (err == NET_XMIT_SUCCESS) {
  420. /* Compute the backlog adjustment that needs
  421. * to be propagated in the qdisc tree to reflect
  422. * all new skbs successfully enqueued.
  423. */
  424. ++cnt;
  425. byte_len += nskb->len;
  426. }
  427. }
  428. if (cnt > 1) {
  429. /* The caller will add the original skb stats to its
  430. * backlog, compensate this if any nskb is enqueued.
  431. */
  432. --cnt;
  433. byte_len -= orig_len;
  434. }
  435. qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
  436. consume_skb(skb);
  437. return err;
  438. }
  439. return dualpi2_enqueue_skb(skb, sch, to_free);
  440. }
  441. /* Select the queue from which the next packet can be dequeued, ensuring that
  442. * neither queue can starve the other with a WRR scheduler.
  443. *
  444. * The sign of the WRR credit determines the next queue, while the size of
  445. * the dequeued packet determines the magnitude of the WRR credit change. If
  446. * either queue is empty, the WRR credit is kept unchanged.
  447. *
  448. * As the dequeued packet can be dropped later, the caller has to perform the
  449. * qdisc_bstats_update() calls.
  450. */
  451. static struct sk_buff *dequeue_packet(struct Qdisc *sch,
  452. struct dualpi2_sched_data *q,
  453. int *credit_change,
  454. u64 now)
  455. {
  456. struct sk_buff *skb = NULL;
  457. int c_len;
  458. *credit_change = 0;
  459. c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
  460. if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
  461. skb = __qdisc_dequeue_head(&q->l_queue->q);
  462. WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
  463. if (c_len)
  464. *credit_change = q->c_protection_wc;
  465. qdisc_qstats_backlog_dec(q->l_queue, skb);
  466. /* Keep the global queue size consistent */
  467. --sch->q.qlen;
  468. q->memory_used -= skb->truesize;
  469. } else if (c_len) {
  470. skb = __qdisc_dequeue_head(&sch->q);
  471. WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
  472. if (qdisc_qlen(q->l_queue))
  473. *credit_change = ~((s32)q->c_protection_wl) + 1;
  474. q->memory_used -= skb->truesize;
  475. } else {
  476. dualpi2_reset_c_protection(q);
  477. return NULL;
  478. }
  479. *credit_change *= qdisc_pkt_len(skb);
  480. qdisc_qstats_backlog_dec(sch, skb);
  481. return skb;
  482. }
  483. static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
  484. u64 now)
  485. {
  486. u64 qdelay = 0;
  487. if (q->step_in_packets)
  488. qdelay = qdisc_qlen(q->l_queue);
  489. else
  490. qdelay = dualpi2_sojourn_time(skb, now);
  491. if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
  492. if (!dualpi2_skb_cb(skb)->ect) {
  493. /* Drop this non-ECT packet */
  494. return 1;
  495. }
  496. if (dualpi2_mark(q, skb))
  497. ++q->step_marks;
  498. }
  499. qdisc_bstats_update(q->l_queue, skb);
  500. return 0;
  501. }
  502. static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
  503. struct Qdisc *sch, enum skb_drop_reason reason)
  504. {
  505. ++q->deferred_drops_cnt;
  506. q->deferred_drops_len += qdisc_pkt_len(skb);
  507. kfree_skb_reason(skb, reason);
  508. qdisc_qstats_drop(sch);
  509. }
  510. static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
  511. {
  512. struct dualpi2_sched_data *q = qdisc_priv(sch);
  513. struct sk_buff *skb;
  514. int credit_change;
  515. u64 now;
  516. now = ktime_get_ns();
  517. while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
  518. if (!q->drop_early && must_drop(sch, q, skb)) {
  519. drop_and_retry(q, skb, sch,
  520. SKB_DROP_REASON_QDISC_CONGESTED);
  521. continue;
  522. }
  523. if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
  524. qdisc_qstats_drop(q->l_queue);
  525. drop_and_retry(q, skb, sch,
  526. SKB_DROP_REASON_DUALPI2_STEP_DROP);
  527. continue;
  528. }
  529. q->c_protection_credit += credit_change;
  530. qdisc_bstats_update(sch, skb);
  531. break;
  532. }
  533. if (q->deferred_drops_cnt) {
  534. qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
  535. q->deferred_drops_len);
  536. q->deferred_drops_cnt = 0;
  537. q->deferred_drops_len = 0;
  538. }
  539. return skb;
  540. }
  541. static s64 __scale_delta(u64 diff)
  542. {
  543. do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
  544. return diff;
  545. }
  546. static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
  547. u64 *qdelay_l)
  548. {
  549. u64 now, qc, ql;
  550. now = ktime_get_ns();
  551. qc = READ_ONCE(q->c_head_ts);
  552. ql = READ_ONCE(q->l_head_ts);
  553. *qdelay_c = qc ? now - qc : 0;
  554. *qdelay_l = ql ? now - ql : 0;
  555. }
  556. static u32 calculate_probability(struct Qdisc *sch)
  557. {
  558. struct dualpi2_sched_data *q = qdisc_priv(sch);
  559. u32 new_prob;
  560. u64 qdelay_c;
  561. u64 qdelay_l;
  562. u64 qdelay;
  563. s64 delta;
  564. get_queue_delays(q, &qdelay_c, &qdelay_l);
  565. qdelay = max(qdelay_l, qdelay_c);
  566. /* Alpha and beta take at most 32b, i.e, the delay difference would
  567. * overflow for queuing delay differences > ~4.2sec.
  568. */
  569. delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
  570. delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
  571. q->last_qdelay = qdelay;
  572. /* Bound new_prob between 0 and MAX_PROB */
  573. if (delta > 0) {
  574. new_prob = __scale_delta(delta) + q->pi2_prob;
  575. if (new_prob < q->pi2_prob)
  576. new_prob = MAX_PROB;
  577. } else {
  578. new_prob = q->pi2_prob - __scale_delta(~delta + 1);
  579. if (new_prob > q->pi2_prob)
  580. new_prob = 0;
  581. }
  582. /* If we do not drop on overload, ensure we cap the L4S probability to
  583. * 100% to keep window fairness when overflowing.
  584. */
  585. if (!q->drop_overload)
  586. return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
  587. return new_prob;
  588. }
  589. static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
  590. {
  591. /* Apply rule of thumb, i.e., doubling the packet length,
  592. * to further include per packet overhead in memory_limit.
  593. */
  594. u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
  595. if (upper_32_bits(memlim))
  596. return U32_MAX;
  597. else
  598. return lower_32_bits(memlim);
  599. }
  600. static u32 convert_us_to_nsec(u32 us)
  601. {
  602. u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
  603. if (upper_32_bits(ns))
  604. return U32_MAX;
  605. return lower_32_bits(ns);
  606. }
  607. static u32 convert_ns_to_usec(u64 ns)
  608. {
  609. do_div(ns, NSEC_PER_USEC);
  610. if (upper_32_bits(ns))
  611. return U32_MAX;
  612. return lower_32_bits(ns);
  613. }
  614. static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
  615. {
  616. struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
  617. struct Qdisc *sch = q->sch;
  618. spinlock_t *root_lock; /* to lock qdisc for probability calculations */
  619. rcu_read_lock();
  620. root_lock = qdisc_lock(qdisc_root_sleeping(sch));
  621. spin_lock(root_lock);
  622. WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
  623. hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
  624. spin_unlock(root_lock);
  625. rcu_read_unlock();
  626. return HRTIMER_RESTART;
  627. }
  628. static struct netlink_range_validation dualpi2_alpha_beta_range = {
  629. .min = 1,
  630. .max = ALPHA_BETA_MAX,
  631. };
  632. static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
  633. [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
  634. [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
  635. [TCA_DUALPI2_TARGET] = { .type = NLA_U32 },
  636. [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
  637. [TCA_DUALPI2_ALPHA] =
  638. NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
  639. [TCA_DUALPI2_BETA] =
  640. NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
  641. [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 },
  642. [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 },
  643. [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 },
  644. [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
  645. [TCA_DUALPI2_DROP_OVERLOAD] =
  646. NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
  647. [TCA_DUALPI2_DROP_EARLY] =
  648. NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
  649. [TCA_DUALPI2_C_PROTECTION] =
  650. NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
  651. [TCA_DUALPI2_ECN_MASK] =
  652. NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
  653. TCA_DUALPI2_ECN_MASK_MAX),
  654. [TCA_DUALPI2_SPLIT_GSO] =
  655. NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
  656. };
  657. static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
  658. struct netlink_ext_ack *extack)
  659. {
  660. struct nlattr *tb[TCA_DUALPI2_MAX + 1];
  661. struct dualpi2_sched_data *q;
  662. int old_backlog;
  663. int old_qlen;
  664. int err;
  665. if (!opt || !nla_len(opt)) {
  666. NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
  667. return -EINVAL;
  668. }
  669. err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
  670. extack);
  671. if (err < 0)
  672. return err;
  673. if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
  674. NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
  675. return -EINVAL;
  676. }
  677. q = qdisc_priv(sch);
  678. sch_tree_lock(sch);
  679. if (tb[TCA_DUALPI2_LIMIT]) {
  680. u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
  681. WRITE_ONCE(sch->limit, limit);
  682. WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
  683. }
  684. if (tb[TCA_DUALPI2_MEMORY_LIMIT])
  685. WRITE_ONCE(q->memory_limit,
  686. nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
  687. if (tb[TCA_DUALPI2_TARGET]) {
  688. u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
  689. WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
  690. }
  691. if (tb[TCA_DUALPI2_TUPDATE]) {
  692. u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
  693. WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
  694. }
  695. if (tb[TCA_DUALPI2_ALPHA]) {
  696. u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
  697. WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
  698. }
  699. if (tb[TCA_DUALPI2_BETA]) {
  700. u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
  701. WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
  702. }
  703. if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
  704. u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
  705. WRITE_ONCE(q->step_in_packets, true);
  706. WRITE_ONCE(q->step_thresh, step_th);
  707. } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
  708. u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
  709. WRITE_ONCE(q->step_in_packets, false);
  710. WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
  711. }
  712. if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
  713. WRITE_ONCE(q->min_qlen_step,
  714. nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
  715. if (tb[TCA_DUALPI2_COUPLING]) {
  716. u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
  717. WRITE_ONCE(q->coupling_factor, coupling);
  718. }
  719. if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
  720. u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
  721. WRITE_ONCE(q->drop_overload, (bool)drop_overload);
  722. }
  723. if (tb[TCA_DUALPI2_DROP_EARLY]) {
  724. u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
  725. WRITE_ONCE(q->drop_early, (bool)drop_early);
  726. }
  727. if (tb[TCA_DUALPI2_C_PROTECTION]) {
  728. u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
  729. dualpi2_calculate_c_protection(sch, q, wc);
  730. }
  731. if (tb[TCA_DUALPI2_ECN_MASK]) {
  732. u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
  733. WRITE_ONCE(q->ecn_mask, ecn_mask);
  734. }
  735. if (tb[TCA_DUALPI2_SPLIT_GSO]) {
  736. u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
  737. WRITE_ONCE(q->split_gso, (bool)split_gso);
  738. }
  739. old_qlen = qdisc_qlen(sch);
  740. old_backlog = sch->qstats.backlog;
  741. while (qdisc_qlen(sch) > sch->limit ||
  742. q->memory_used > q->memory_limit) {
  743. struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
  744. q->memory_used -= skb->truesize;
  745. qdisc_qstats_backlog_dec(sch, skb);
  746. rtnl_qdisc_drop(skb, sch);
  747. }
  748. qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
  749. old_backlog - sch->qstats.backlog);
  750. sch_tree_unlock(sch);
  751. return 0;
  752. }
  753. /* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
  754. static void dualpi2_reset_default(struct Qdisc *sch)
  755. {
  756. struct dualpi2_sched_data *q = qdisc_priv(sch);
  757. q->sch->limit = 10000; /* Max 125ms at 1Gbps */
  758. q->memory_limit = get_memory_limit(sch, q->sch->limit);
  759. q->pi2_target = 15 * NSEC_PER_MSEC;
  760. q->pi2_tupdate = 16 * NSEC_PER_MSEC;
  761. q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */
  762. q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */
  763. q->step_thresh = 1 * NSEC_PER_MSEC;
  764. q->step_in_packets = false;
  765. dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */
  766. q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */
  767. q->min_qlen_step = 0; /* Always apply step mark in L-queue */
  768. q->coupling_factor = 2; /* window fairness for equal RTTs */
  769. q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
  770. q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
  771. q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */
  772. }
  773. static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
  774. struct netlink_ext_ack *extack)
  775. {
  776. struct dualpi2_sched_data *q = qdisc_priv(sch);
  777. int err;
  778. q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
  779. TC_H_MAKE(sch->handle, 1), extack);
  780. if (!q->l_queue)
  781. return -ENOMEM;
  782. err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
  783. if (err)
  784. return err;
  785. q->sch = sch;
  786. dualpi2_reset_default(sch);
  787. hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
  788. HRTIMER_MODE_ABS_PINNED_SOFT);
  789. if (opt && nla_len(opt)) {
  790. err = dualpi2_change(sch, opt, extack);
  791. if (err)
  792. return err;
  793. }
  794. hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
  795. HRTIMER_MODE_ABS_PINNED_SOFT);
  796. return 0;
  797. }
  798. static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
  799. {
  800. struct dualpi2_sched_data *q = qdisc_priv(sch);
  801. struct nlattr *opts;
  802. bool step_in_pkts;
  803. u32 step_th;
  804. step_in_pkts = READ_ONCE(q->step_in_packets);
  805. step_th = READ_ONCE(q->step_thresh);
  806. opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
  807. if (!opts)
  808. goto nla_put_failure;
  809. if (step_in_pkts &&
  810. (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
  811. nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
  812. READ_ONCE(q->memory_limit)) ||
  813. nla_put_u32(skb, TCA_DUALPI2_TARGET,
  814. convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
  815. nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
  816. convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
  817. nla_put_u32(skb, TCA_DUALPI2_ALPHA,
  818. dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
  819. nla_put_u32(skb, TCA_DUALPI2_BETA,
  820. dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
  821. nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
  822. nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
  823. READ_ONCE(q->min_qlen_step)) ||
  824. nla_put_u8(skb, TCA_DUALPI2_COUPLING,
  825. READ_ONCE(q->coupling_factor)) ||
  826. nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
  827. READ_ONCE(q->drop_overload)) ||
  828. nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
  829. READ_ONCE(q->drop_early)) ||
  830. nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
  831. READ_ONCE(q->c_protection_wc)) ||
  832. nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
  833. nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
  834. goto nla_put_failure;
  835. if (!step_in_pkts &&
  836. (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
  837. nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
  838. READ_ONCE(q->memory_limit)) ||
  839. nla_put_u32(skb, TCA_DUALPI2_TARGET,
  840. convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
  841. nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
  842. convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
  843. nla_put_u32(skb, TCA_DUALPI2_ALPHA,
  844. dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
  845. nla_put_u32(skb, TCA_DUALPI2_BETA,
  846. dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
  847. nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
  848. convert_ns_to_usec(step_th)) ||
  849. nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
  850. READ_ONCE(q->min_qlen_step)) ||
  851. nla_put_u8(skb, TCA_DUALPI2_COUPLING,
  852. READ_ONCE(q->coupling_factor)) ||
  853. nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
  854. READ_ONCE(q->drop_overload)) ||
  855. nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
  856. READ_ONCE(q->drop_early)) ||
  857. nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
  858. READ_ONCE(q->c_protection_wc)) ||
  859. nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
  860. nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
  861. goto nla_put_failure;
  862. return nla_nest_end(skb, opts);
  863. nla_put_failure:
  864. nla_nest_cancel(skb, opts);
  865. return -1;
  866. }
  867. static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
  868. {
  869. struct dualpi2_sched_data *q = qdisc_priv(sch);
  870. struct tc_dualpi2_xstats st = {
  871. .prob = READ_ONCE(q->pi2_prob),
  872. .packets_in_c = q->packets_in_c,
  873. .packets_in_l = q->packets_in_l,
  874. .maxq = q->maxq,
  875. .ecn_mark = q->ecn_mark,
  876. .credit = q->c_protection_credit,
  877. .step_marks = q->step_marks,
  878. .memory_used = q->memory_used,
  879. .max_memory_used = q->max_memory_used,
  880. .memory_limit = q->memory_limit,
  881. };
  882. u64 qc, ql;
  883. get_queue_delays(q, &qc, &ql);
  884. st.delay_l = convert_ns_to_usec(ql);
  885. st.delay_c = convert_ns_to_usec(qc);
  886. return gnet_stats_copy_app(d, &st, sizeof(st));
  887. }
  888. /* Reset both L-queue and C-queue, internal packet counters, PI probability,
  889. * C-queue protection credit, and timestamps, while preserving current
  890. * configuration of DUALPI2.
  891. */
  892. static void dualpi2_reset(struct Qdisc *sch)
  893. {
  894. struct dualpi2_sched_data *q = qdisc_priv(sch);
  895. qdisc_reset_queue(sch);
  896. qdisc_reset_queue(q->l_queue);
  897. q->c_head_ts = 0;
  898. q->l_head_ts = 0;
  899. q->pi2_prob = 0;
  900. q->packets_in_c = 0;
  901. q->packets_in_l = 0;
  902. q->maxq = 0;
  903. q->ecn_mark = 0;
  904. q->step_marks = 0;
  905. q->memory_used = 0;
  906. q->max_memory_used = 0;
  907. dualpi2_reset_c_protection(q);
  908. }
  909. static void dualpi2_destroy(struct Qdisc *sch)
  910. {
  911. struct dualpi2_sched_data *q = qdisc_priv(sch);
  912. q->pi2_tupdate = 0;
  913. hrtimer_cancel(&q->pi2_timer);
  914. if (q->l_queue)
  915. qdisc_put(q->l_queue);
  916. tcf_block_put(q->tcf_block);
  917. }
  918. static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
  919. {
  920. return NULL;
  921. }
  922. static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
  923. {
  924. return 0;
  925. }
  926. static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
  927. u32 classid)
  928. {
  929. return 0;
  930. }
  931. static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
  932. {
  933. }
  934. static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
  935. struct netlink_ext_ack *extack)
  936. {
  937. struct dualpi2_sched_data *q = qdisc_priv(sch);
  938. if (cl)
  939. return NULL;
  940. return q->tcf_block;
  941. }
  942. static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
  943. {
  944. unsigned int i;
  945. if (arg->stop)
  946. return;
  947. /* We statically define only 2 queues */
  948. for (i = 0; i < 2; i++) {
  949. if (arg->count < arg->skip) {
  950. arg->count++;
  951. continue;
  952. }
  953. if (arg->fn(sch, i + 1, arg) < 0) {
  954. arg->stop = 1;
  955. break;
  956. }
  957. arg->count++;
  958. }
  959. }
  960. /* Minimal class support to handle tc filters */
  961. static const struct Qdisc_class_ops dualpi2_class_ops = {
  962. .leaf = dualpi2_leaf,
  963. .find = dualpi2_find,
  964. .tcf_block = dualpi2_tcf_block,
  965. .bind_tcf = dualpi2_bind,
  966. .unbind_tcf = dualpi2_unbind,
  967. .walk = dualpi2_walk,
  968. };
  969. static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
  970. .id = "dualpi2",
  971. .cl_ops = &dualpi2_class_ops,
  972. .priv_size = sizeof(struct dualpi2_sched_data),
  973. .enqueue = dualpi2_qdisc_enqueue,
  974. .dequeue = dualpi2_qdisc_dequeue,
  975. .peek = qdisc_peek_dequeued,
  976. .init = dualpi2_init,
  977. .destroy = dualpi2_destroy,
  978. .reset = dualpi2_reset,
  979. .change = dualpi2_change,
  980. .dump = dualpi2_dump,
  981. .dump_stats = dualpi2_dump_stats,
  982. .owner = THIS_MODULE,
  983. };
  984. static int __init dualpi2_module_init(void)
  985. {
  986. return register_qdisc(&dualpi2_qdisc_ops);
  987. }
  988. static void __exit dualpi2_module_exit(void)
  989. {
  990. unregister_qdisc(&dualpi2_qdisc_ops);
  991. }
  992. module_init(dualpi2_module_init);
  993. module_exit(dualpi2_module_exit);
  994. MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
  995. MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
  996. MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
  997. MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
  998. MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
  999. MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
  1000. MODULE_LICENSE("Dual BSD/GPL");
  1001. MODULE_VERSION("1.0");