tcp_ecn.h 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. #ifndef _TCP_ECN_H
  3. #define _TCP_ECN_H
  4. #include <linux/tcp.h>
  5. #include <linux/skbuff.h>
  6. #include <linux/bitfield.h>
  7. #include <net/inet_connection_sock.h>
  8. #include <net/sock.h>
  9. #include <net/tcp.h>
  10. #include <net/inet_ecn.h>
  /* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
   * attempted to be negotiated and requested for incoming (IN) and
   * outgoing (OUT) connections, respectively.
   */
  enum tcp_ecn_mode {
      TCP_ECN_IN_NOECN_OUT_NOECN   = 0,   /* in: no ECN, out: no ECN */
      TCP_ECN_IN_ECN_OUT_ECN       = 1,   /* in: ECN,    out: ECN    */
      TCP_ECN_IN_ECN_OUT_NOECN     = 2,   /* in: ECN,    out: no ECN */
      TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,   /* in: AccECN, out: AccECN */
      TCP_ECN_IN_ACCECN_OUT_ECN    = 4,   /* in: AccECN, out: ECN    */
      TCP_ECN_IN_ACCECN_OUT_NOECN  = 5,   /* in: AccECN, out: no ECN */
  };
  /* Selects how the AccECN option is sent once AccECN has been
   * successfully negotiated.
   */
  enum tcp_accecn_option {
      TCP_ACCECN_OPTION_DISABLED = 0,
      TCP_ACCECN_OPTION_MINIMUM  = 1,
      TCP_ACCECN_OPTION_FULL     = 2,
      TCP_ACCECN_OPTION_PERSIST  = 3,
  };
  /* Apply either ECT(0) or ECT(1) to @sk: the choice handed to
   * __INET_ECN_xmit() is whatever tcp_ca_ect_1_negotiation() reports
   * (TCP_CONG_ECT_1_NEGOTIATION flag).
   */
  static inline void INET_ECN_xmit_ect_1_negotiation(struct sock *sk)
  {
      __INET_ECN_xmit(sk,
                      tcp_ca_ect_1_negotiation(sk));
  }
  35. static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
  36. {
  37. /* Do not set CWR if in AccECN mode! */
  38. if (tcp_ecn_mode_rfc3168(tp))
  39. tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
  40. }
  41. static inline void tcp_ecn_accept_cwr(struct sock *sk,
  42. const struct sk_buff *skb)
  43. {
  44. struct tcp_sock *tp = tcp_sk(sk);
  45. if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
  46. tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
  47. /* If the sender is telling us it has entered CWR, then its
  48. * cwnd may be very low (even just 1 packet), so we should ACK
  49. * immediately.
  50. */
  51. if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
  52. inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
  53. }
  54. }
  55. static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
  56. {
  57. tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
  58. }
  59. static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
  60. {
  61. return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
  62. }
  63. static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
  64. {
  65. return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
  66. }
  67. static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
  68. {
  69. return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
  70. }
  71. static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
  72. {
  73. return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
  74. }
  75. static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
  76. {
  77. tp->accecn_fail_mode |= mode;
  78. }
  79. static inline u8 tcp_accecn_ace(const struct tcphdr *th)
  80. {
  81. return (th->ae << 2) | (th->cwr << 1) | th->ece;
  82. }
  83. /* Infer the ECT value our SYN arrived with from the echoed ACE field */
  84. static inline int tcp_accecn_extract_syn_ect(u8 ace)
  85. {
  86. /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
  87. static const int ace_to_ecn[8] = {
  88. INET_ECN_ECT_0, /* 0b000 (Undefined) */
  89. INET_ECN_ECT_1, /* 0b001 (Undefined) */
  90. INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */
  91. INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */
  92. INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */
  93. INET_ECN_ECT_1, /* 0b101 (Reserved) */
  94. INET_ECN_CE, /* 0b110 (CE is received) */
  95. INET_ECN_ECT_1 /* 0b111 (Undefined) */
  96. };
  97. return ace_to_ecn[ace & 0x7];
  98. }
  99. /* Check ECN field transition to detect invalid transitions */
  100. static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
  101. {
  102. if (rcv == snt)
  103. return true;
  104. /* Non-ECT altered to something or something became non-ECT */
  105. if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
  106. return false;
  107. /* CE -> ECT(0/1)? */
  108. if (snt == INET_ECN_CE)
  109. return false;
  110. return true;
  111. }
  112. static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
  113. u8 sent_ect)
  114. {
  115. u8 ect = tcp_accecn_extract_syn_ect(ace);
  116. struct tcp_sock *tp = tcp_sk(sk);
  117. if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
  118. return true;
  119. if (!tcp_ect_transition_valid(sent_ect, ect)) {
  120. tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
  121. return false;
  122. }
  123. return true;
  124. }
  125. static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
  126. u8 saw_opt)
  127. {
  128. tp->saw_accecn_opt = saw_opt;
  129. if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
  130. tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
  131. }
  132. /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
  133. static inline void tcp_accecn_third_ack(struct sock *sk,
  134. const struct sk_buff *skb, u8 sent_ect)
  135. {
  136. u8 ace = tcp_accecn_ace(tcp_hdr(skb));
  137. struct tcp_sock *tp = tcp_sk(sk);
  138. switch (ace) {
  139. case 0x0:
  140. /* Invalid value */
  141. if (!TCP_SKB_CB(skb)->sacked)
  142. tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV |
  143. TCP_ACCECN_OPT_FAIL_RECV);
  144. break;
  145. case 0x7:
  146. case 0x5:
  147. case 0x1:
  148. /* Unused but legal values */
  149. break;
  150. default:
  151. /* Validation only applies to first non-data packet */
  152. if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
  153. !TCP_SKB_CB(skb)->sacked &&
  154. tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
  155. if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
  156. !tp->delivered_ce)
  157. tp->delivered_ce++;
  158. }
  159. break;
  160. }
  161. }
  162. /* Demand the minimum # to send AccECN optnio */
  163. static inline void tcp_accecn_opt_demand_min(struct sock *sk,
  164. u8 opt_demand_min)
  165. {
  166. struct tcp_sock *tp = tcp_sk(sk);
  167. u8 opt_demand;
  168. opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
  169. tp->accecn_opt_demand = opt_demand;
  170. }
  171. /* Maps IP ECN field ECT/CE code point to AccECN option field number, given
  172. * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
  173. */
  174. static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
  175. {
  176. switch (ecnfield & INET_ECN_MASK) {
  177. case INET_ECN_NOT_ECT:
  178. return 0; /* AccECN does not send counts of NOT_ECT */
  179. case INET_ECN_ECT_1:
  180. return 1;
  181. case INET_ECN_CE:
  182. return 2;
  183. case INET_ECN_ECT_0:
  184. return 3;
  185. }
  186. return 0;
  187. }
  188. /* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
  189. * Some fields do not start from zero, to detect zeroing by middleboxes.
  190. */
  191. static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
  192. {
  193. switch (ecnfield & INET_ECN_MASK) {
  194. case INET_ECN_NOT_ECT:
  195. return 0; /* AccECN does not send counts of NOT_ECT */
  196. case INET_ECN_ECT_1:
  197. return TCP_ACCECN_E1B_INIT_OFFSET;
  198. case INET_ECN_CE:
  199. return TCP_ACCECN_CEB_INIT_OFFSET;
  200. case INET_ECN_ECT_0:
  201. return TCP_ACCECN_E0B_INIT_OFFSET;
  202. }
  203. return 0;
  204. }
  205. /* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
  206. static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
  207. bool order)
  208. {
  209. /* Based on Table 5 of the AccECN spec to map (option, order) to
  210. * the corresponding ECN conuters (ECT-1, ECT-0, or CE).
  211. */
  212. static const u8 optfield_lookup[2][3] = {
  213. /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
  214. { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
  215. /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
  216. { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
  217. };
  218. return optfield_lookup[order][option % 3];
  219. }
  220. /* Handles AccECN option ECT and CE 24-bit byte counters update into
  221. * the u32 value in tcp_sock. As we're processing TCP options, it is
  222. * safe to access from - 1.
  223. */
  224. static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
  225. u32 init_offset)
  226. {
  227. u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
  228. 0xFFFFFFU;
  229. u32 delta = (truncated - *cnt) & 0xFFFFFFU;
  230. /* If delta has the highest bit set (24th bit) indicating
  231. * negative, sign extend to correct an estimation using
  232. * sign_extend32(delta, 24 - 1)
  233. */
  234. delta = sign_extend32(delta, 23);
  235. *cnt += delta;
  236. return (s32)delta;
  237. }
  238. /* Updates Accurate ECN received counters from the received IP ECN field */
  239. static inline void tcp_ecn_received_counters(struct sock *sk,
  240. const struct sk_buff *skb, u32 len)
  241. {
  242. u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
  243. u8 is_ce = INET_ECN_is_ce(ecnfield);
  244. struct tcp_sock *tp = tcp_sk(sk);
  245. bool ecn_edge;
  246. if (!INET_ECN_is_not_ect(ecnfield)) {
  247. u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
  248. /* As for accurate ECN, the TCP_ECN_SEEN flag is set by
  249. * tcp_ecn_received_counters() when the ECN codepoint of
  250. * received TCP data or ACK contains ECT(0), ECT(1), or CE.
  251. */
  252. if (!tcp_ecn_mode_rfc3168(tp))
  253. tp->ecn_flags |= TCP_ECN_SEEN;
  254. /* ACE counter tracks *all* segments including pure ACKs */
  255. tp->received_ce += pcount;
  256. tp->received_ce_pending = min(tp->received_ce_pending + pcount,
  257. 0xfU);
  258. if (len > 0) {
  259. u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
  260. u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
  261. u32 bytes_mask = GENMASK_U32(31, 22);
  262. tp->received_ecn_bytes[ecnfield - 1] += len;
  263. tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
  264. minlen);
  265. /* Send AccECN option at least once per 2^22-byte
  266. * increase in any ECN byte counter.
  267. */
  268. if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
  269. bytes_mask) {
  270. tcp_accecn_opt_demand_min(sk, 1);
  271. }
  272. }
  273. }
  274. ecn_edge = tp->prev_ecnfield != ecnfield;
  275. if (ecn_edge || is_ce) {
  276. tp->prev_ecnfield = ecnfield;
  277. /* Demand Accurate ECN change-triggered ACKs. Two ACK are
  278. * demanded to indicate unambiguously the ecnfield value
  279. * in the latter ACK.
  280. */
  281. if (tcp_ecn_mode_accecn(tp)) {
  282. if (ecn_edge)
  283. inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
  284. tp->accecn_opt_demand = 2;
  285. }
  286. }
  287. }
  288. /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
  289. * initialized at the start of the half-connection. [...] These byte counters
  290. * reflect only the TCP payload length, excluding TCP header and TCP options.
  291. */
  292. static inline void tcp_ecn_received_counters_payload(struct sock *sk,
  293. const struct sk_buff *skb)
  294. {
  295. const struct tcphdr *th = (const struct tcphdr *)skb->data;
  296. tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
  297. }
  298. /* AccECN specification, 5.1: [...] a server can determine that it
  299. * negotiated AccECN as [...] if the ACK contains an ACE field with
  300. * the value 0b010 to 0b111 (decimal 2 to 7).
  301. */
  302. static inline bool cookie_accecn_ok(const struct tcphdr *th)
  303. {
  304. return tcp_accecn_ace(th) > 0x1;
  305. }
  306. /* Used to form the ACE flags for SYN/ACK */
  307. static inline u16 tcp_accecn_reflector_flags(u8 ect)
  308. {
  309. /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
  310. * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
  311. * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
  312. */
  313. static const u8 ecn_to_ace_flags[4] = {
  314. 0b010, /* Not-ECT is received */
  315. 0b011, /* ECT(1) is received */
  316. 0b100, /* ECT(0) is received */
  317. 0b110 /* CE is received */
  318. };
  319. return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
  320. }
  321. /* AccECN specification, 3.1.2: If a TCP server that implements AccECN
  322. * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
  323. * to any combination other than 000, 011 or 111, it MUST negotiate the
  324. * use of AccECN as if they had been set to 111.
  325. */
  326. static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
  327. {
  328. u8 ace = tcp_accecn_ace(th);
  329. return ace && ace != 0x3;
  330. }
  331. static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
  332. {
  333. BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
  334. BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
  335. BUILD_BUG_ON(INET_ECN_CE != 0x3);
  336. counter_array[INET_ECN_ECT_1 - 1] = 0;
  337. counter_array[INET_ECN_ECT_0 - 1] = 0;
  338. counter_array[INET_ECN_CE - 1] = 0;
  339. }
  340. static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
  341. {
  342. tp->received_ce = 0;
  343. tp->received_ce_pending = 0;
  344. __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
  345. __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
  346. tp->accecn_opt_sent_w_dsack = 0;
  347. tp->accecn_minlen = 0;
  348. tp->accecn_opt_demand = 0;
  349. tp->est_ecnfield = 0;
  350. }
  351. /* Used for make_synack to form the ACE flags */
  352. static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
  353. {
  354. /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
  355. * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
  356. * +====================+====================================+
  357. * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK |
  358. * | received on SYN | AE CWR ECE |
  359. * +====================+====================================+
  360. * | Not-ECT | 0 1 0 |
  361. * | ECT(1) | 0 1 1 |
  362. * | ECT(0) | 1 0 0 |
  363. * | CE | 1 1 0 |
  364. * +====================+====================================+
  365. */
  366. th->ae = !!(ect & INET_ECN_ECT_0);
  367. th->cwr = ect != INET_ECN_ECT_0;
  368. th->ece = ect == INET_ECN_ECT_1;
  369. }
  370. static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
  371. struct tcphdr *th)
  372. {
  373. u32 wire_ace;
  374. /* The final packet of the 3WHS or anything like it must reflect
  375. * the SYN/ACK ECT instead of putting CEP into ACE field, such
  376. * case show up in tcp_flags.
  377. */
  378. if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
  379. wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
  380. th->ece = !!(wire_ace & 0x1);
  381. th->cwr = !!(wire_ace & 0x2);
  382. th->ae = !!(wire_ace & 0x4);
  383. tp->received_ce_pending = 0;
  384. }
  385. }
  386. static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
  387. u8 opt_offset)
  388. {
  389. u8 *ptr = skb_transport_header(skb) + opt_offset;
  390. unsigned int optlen = ptr[1] - 2;
  391. if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
  392. return TCP_ACCECN_OPT_FAIL_SEEN;
  393. ptr += 2;
  394. /* Detect option zeroing: an AccECN connection "MAY check that the
  395. * initial value of the EE0B field or the EE1B field is non-zero"
  396. */
  397. if (optlen < TCPOLEN_ACCECN_PERFIELD)
  398. return TCP_ACCECN_OPT_EMPTY_SEEN;
  399. if (get_unaligned_be24(ptr) == 0)
  400. return TCP_ACCECN_OPT_FAIL_SEEN;
  401. if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
  402. return TCP_ACCECN_OPT_COUNTER_SEEN;
  403. ptr += TCPOLEN_ACCECN_PERFIELD * 2;
  404. if (get_unaligned_be24(ptr) == 0)
  405. return TCP_ACCECN_OPT_FAIL_SEEN;
  406. return TCP_ACCECN_OPT_COUNTER_SEEN;
  407. }
  408. static inline void tcp_ecn_rcv_synack_accecn(struct sock *sk,
  409. const struct sk_buff *skb, u8 dsf)
  410. {
  411. struct tcp_sock *tp = tcp_sk(sk);
  412. tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
  413. tp->syn_ect_rcv = dsf & INET_ECN_MASK;
  414. /* Demand Accurate ECN option in response to the SYN on the SYN/ACK
  415. * and the TCP server will try to send one more packet with an AccECN
  416. * Option at a later point during the connection.
  417. */
  418. if (tp->rx_opt.accecn &&
  419. tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
  420. u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
  421. tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
  422. tp->accecn_opt_demand = 2;
  423. }
  424. }
  425. /* See Table 2 of the AccECN draft */
  426. static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
  427. const struct tcphdr *th, u8 ip_dsfield)
  428. {
  429. struct tcp_sock *tp = tcp_sk(sk);
  430. u8 ace = tcp_accecn_ace(th);
  431. switch (ace) {
  432. case 0x0:
  433. case 0x7:
  434. /* +========+========+============+=============+
  435. * | A | B | SYN/ACK | Feedback |
  436. * | | | B->A | Mode of A |
  437. * | | | AE CWR ECE | |
  438. * +========+========+============+=============+
  439. * | AccECN | No ECN | 0 0 0 | Not ECN |
  440. * | AccECN | Broken | 1 1 1 | Not ECN |
  441. * +========+========+============+=============+
  442. */
  443. tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
  444. break;
  445. case 0x1:
  446. /* +========+========+============+=============+
  447. * | A | B | SYN/ACK | Feedback |
  448. * | | | B->A | Mode of A |
  449. * | | | AE CWR ECE | |
  450. * +========+========+============+=============+
  451. * | AccECN | ECN | 0 0 1 | Classic ECN |
  452. * | Nonce | AccECN | 0 0 1 | Classic ECN |
  453. * | ECN | AccECN | 0 0 1 | Classic ECN |
  454. * +========+========+============+=============+
  455. */
  456. if (tcp_ca_no_fallback_rfc3168(sk))
  457. tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
  458. else
  459. tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
  460. break;
  461. case 0x5:
  462. if (tcp_ecn_mode_pending(tp)) {
  463. tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
  464. if (INET_ECN_is_ce(ip_dsfield)) {
  465. tp->received_ce++;
  466. tp->received_ce_pending++;
  467. }
  468. }
  469. break;
  470. default:
  471. tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
  472. if (INET_ECN_is_ce(ip_dsfield) &&
  473. tcp_accecn_validate_syn_feedback(sk, ace,
  474. tp->syn_ect_snt)) {
  475. tp->received_ce++;
  476. tp->received_ce_pending++;
  477. }
  478. break;
  479. }
  480. }
  481. static inline void tcp_ecn_rcv_syn(struct sock *sk, const struct tcphdr *th,
  482. const struct sk_buff *skb)
  483. {
  484. struct tcp_sock *tp = tcp_sk(sk);
  485. if (tcp_ecn_mode_pending(tp)) {
  486. if (!tcp_accecn_syn_requested(th)) {
  487. /* Downgrade to classic ECN feedback */
  488. tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
  489. } else {
  490. tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
  491. INET_ECN_MASK;
  492. tp->prev_ecnfield = tp->syn_ect_rcv;
  493. tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
  494. }
  495. }
  496. if (tcp_ecn_mode_rfc3168(tp) &&
  497. (!th->ece || !th->cwr || tcp_ca_no_fallback_rfc3168(sk)))
  498. tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
  499. }
  500. static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
  501. const struct tcphdr *th)
  502. {
  503. if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
  504. return true;
  505. return false;
  506. }
  507. /* Packet ECN state for a SYN-ACK */
  508. static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
  509. {
  510. struct tcp_sock *tp = tcp_sk(sk);
  511. TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
  512. if (tcp_ecn_disabled(tp))
  513. TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
  514. else if (tcp_ca_needs_ecn(sk) ||
  515. tcp_bpf_ca_needs_ecn(sk))
  516. INET_ECN_xmit_ect_1_negotiation(sk);
  517. if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
  518. TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
  519. TCP_SKB_CB(skb)->tcp_flags |=
  520. tcp_accecn_reflector_flags(tp->syn_ect_rcv);
  521. tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
  522. }
  523. }
  524. /* Packet ECN state for a SYN. */
  525. static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
  526. {
  527. struct tcp_sock *tp = tcp_sk(sk);
  528. bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
  529. bool use_ecn, use_accecn;
  530. u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
  531. use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN ||
  532. tcp_ca_needs_accecn(sk);
  533. use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
  534. tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
  535. tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
  536. if (!use_ecn) {
  537. const struct dst_entry *dst = __sk_dst_get(sk);
  538. if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
  539. use_ecn = true;
  540. }
  541. tp->ecn_flags = 0;
  542. if (use_ecn) {
  543. if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
  544. INET_ECN_xmit_ect_1_negotiation(sk);
  545. TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
  546. if (use_accecn) {
  547. TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
  548. tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
  549. tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
  550. } else {
  551. tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
  552. }
  553. }
  554. }
  555. static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
  556. {
  557. if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
  558. /* tp->ecn_flags are cleared at a later point in time when
  559. * SYN ACK is ultimatively being received.
  560. */
  561. TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
  562. }
  563. }
  564. static inline void
  565. tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
  566. enum tcp_synack_type synack_type)
  567. {
  568. /* Accurate ECN shall retransmit SYN/ACK with ACE=0 if the
  569. * previously retransmitted SYN/ACK also times out.
  570. */
  571. if (!req->num_timeout || synack_type != TCP_SYNACK_RETRANS) {
  572. if (tcp_rsk(req)->accecn_ok)
  573. tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
  574. else if (inet_rsk(req)->ecn_ok)
  575. th->ece = 1;
  576. } else if (tcp_rsk(req)->accecn_ok) {
  577. th->ae = 0;
  578. th->cwr = 0;
  579. th->ece = 0;
  580. }
  581. }
  582. static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
  583. {
  584. u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
  585. const struct tcp_sock *tp = tcp_sk(sk);
  586. if (!ecn_beacon)
  587. return false;
  588. return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
  589. (tp->srtt_us >> 3);
  590. }
  591. #endif /* _TCP_ECN_H */