act_ct.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706
  1. // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
  2. /* -
  3. * net/sched/act_ct.c Connection Tracking action
  4. *
  5. * Authors: Paul Blakey <paulb@mellanox.com>
  6. * Yossi Kuperman <yossiku@mellanox.com>
  7. * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
  8. */
  9. #include <linux/module.h>
  10. #include <linux/init.h>
  11. #include <linux/kernel.h>
  12. #include <linux/skbuff.h>
  13. #include <linux/rtnetlink.h>
  14. #include <linux/pkt_cls.h>
  15. #include <linux/if_tunnel.h>
  16. #include <linux/ip.h>
  17. #include <linux/ipv6.h>
  18. #include <linux/rhashtable.h>
  19. #include <net/gre.h>
  20. #include <net/netlink.h>
  21. #include <net/pkt_sched.h>
  22. #include <net/pkt_cls.h>
  23. #include <net/act_api.h>
  24. #include <net/ip.h>
  25. #include <net/ipv6_frag.h>
  26. #include <uapi/linux/tc_act/tc_ct.h>
  27. #include <net/tc_act/tc_ct.h>
  28. #include <net/tc_wrapper.h>
  29. #include <net/netfilter/nf_flow_table.h>
  30. #include <net/netfilter/nf_conntrack.h>
  31. #include <net/netfilter/nf_conntrack_core.h>
  32. #include <net/netfilter/nf_conntrack_zones.h>
  33. #include <net/netfilter/nf_conntrack_helper.h>
  34. #include <net/netfilter/nf_conntrack_acct.h>
  35. #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
  36. #include <net/netfilter/nf_conntrack_act_ct.h>
  37. #include <net/netfilter/nf_conntrack_seqadj.h>
  38. #include <uapi/linux/netfilter/nf_nat.h>
  /* Workqueue on which tcf_ct_flow_table_put() queues flow-table cleanup. */
  39. static struct workqueue_struct *act_ct_wq;
  /* Hash table of struct tcf_ct_flow_table objects, used with zones_params. */
  40. static struct rhashtable zones_ht;
  /* Held by tcf_ct_flow_table_get() around its lookup/insert in zones_ht. */
  41. static DEFINE_MUTEX(zones_mutex);
  /* Lookup key for zones_ht: tcf_ct_flow_table_get() fills it from the
   * network namespace and the zone of the action parameters.
   */
  42. struct zones_ht_key {
  43. struct net *net;
  44. u16 zone;
  45. };
  /* One flow table per (net, zone) key, shared through a reference count. */
  46. struct tcf_ct_flow_table {
  47. struct rhash_head node; /* In zones tables */
  /* Deferred cleanup, queued by tcf_ct_flow_table_put() on the last put. */
  48. struct rcu_work rwork;
  /* Embedded flowtable; callbacks recover the container via container_of(). */
  49. struct nf_flowtable nf_ft;
  /* Set to 1 on creation; dropping to zero triggers removal and cleanup. */
  50. refcount_t ref;
  /* Hash key; must match zones_params.key_offset. */
  51. struct zones_ht_key key;
  /* NOTE(review): no user of this flag is visible in this part of the file. */
  52. bool dying;
  53. };
  /* rhashtable layout for zones_ht. The key length stops at the end of
   * zones_ht_key.zone, so any bytes after that member are not hashed or
   * compared.
   */
  54. static const struct rhashtable_params zones_params = {
  55. .head_offset = offsetof(struct tcf_ct_flow_table, node),
  56. .key_offset = offsetof(struct tcf_ct_flow_table, key),
  57. .key_len = offsetofend(struct zones_ht_key, zone),
  58. .automatic_shrinking = true,
  59. };
  60. static struct flow_action_entry *
  61. tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
  62. {
  63. int i = flow_action->num_entries++;
  64. return &flow_action->entries[i];
  65. }
  66. static void tcf_ct_add_mangle_action(struct flow_action *action,
  67. enum flow_action_mangle_base htype,
  68. u32 offset,
  69. u32 mask,
  70. u32 val)
  71. {
  72. struct flow_action_entry *entry;
  73. entry = tcf_ct_flow_table_flow_action_get_next(action);
  74. entry->id = FLOW_ACTION_MANGLE;
  75. entry->mangle.htype = htype;
  76. entry->mangle.mask = ~mask;
  77. entry->mangle.offset = offset;
  78. entry->mangle.val = val;
  79. }
  80. /* The following nat helper functions check if the inverted reverse tuple
  81. * (target) is different than the current dir tuple - meaning nat for ports
  82. * and/or ip is needed, and add the relevant mangle actions.
  83. */
  84. static void
  85. tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
  86. struct nf_conntrack_tuple target,
  87. struct flow_action *action)
  88. {
  89. if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
  90. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
  91. offsetof(struct iphdr, saddr),
  92. 0xFFFFFFFF,
  93. be32_to_cpu(target.src.u3.ip));
  94. if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
  95. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
  96. offsetof(struct iphdr, daddr),
  97. 0xFFFFFFFF,
  98. be32_to_cpu(target.dst.u3.ip));
  99. }
  100. static void
  101. tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
  102. union nf_inet_addr *addr,
  103. u32 offset)
  104. {
  105. int i;
  106. for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
  107. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
  108. i * sizeof(u32) + offset,
  109. 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
  110. }
  111. static void
  112. tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
  113. struct nf_conntrack_tuple target,
  114. struct flow_action *action)
  115. {
  116. if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
  117. tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
  118. offsetof(struct ipv6hdr,
  119. saddr));
  120. if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
  121. tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
  122. offsetof(struct ipv6hdr,
  123. daddr));
  124. }
  125. static void
  126. tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
  127. struct nf_conntrack_tuple target,
  128. struct flow_action *action)
  129. {
  130. __be16 target_src = target.src.u.tcp.port;
  131. __be16 target_dst = target.dst.u.tcp.port;
  132. if (target_src != tuple->src.u.tcp.port)
  133. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
  134. offsetof(struct tcphdr, source),
  135. 0xFFFF, be16_to_cpu(target_src));
  136. if (target_dst != tuple->dst.u.tcp.port)
  137. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
  138. offsetof(struct tcphdr, dest),
  139. 0xFFFF, be16_to_cpu(target_dst));
  140. }
  141. static void
  142. tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
  143. struct nf_conntrack_tuple target,
  144. struct flow_action *action)
  145. {
  146. __be16 target_src = target.src.u.udp.port;
  147. __be16 target_dst = target.dst.u.udp.port;
  148. if (target_src != tuple->src.u.udp.port)
  149. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
  150. offsetof(struct udphdr, source),
  151. 0xFFFF, be16_to_cpu(target_src));
  152. if (target_dst != tuple->dst.u.udp.port)
  153. tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
  154. offsetof(struct udphdr, dest),
  155. 0xFFFF, be16_to_cpu(target_dst));
  156. }
  157. static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
  158. enum ip_conntrack_dir dir,
  159. enum ip_conntrack_info ctinfo,
  160. struct flow_action *action)
  161. {
  162. struct nf_conn_labels *ct_labels;
  163. struct flow_action_entry *entry;
  164. u32 *act_ct_labels;
  165. entry = tcf_ct_flow_table_flow_action_get_next(action);
  166. entry->id = FLOW_ACTION_CT_METADATA;
  167. #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
  168. entry->ct_metadata.mark = READ_ONCE(ct->mark);
  169. #endif
  170. /* aligns with the CT reference on the SKB nf_ct_set */
  171. entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
  172. entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
  173. act_ct_labels = entry->ct_metadata.labels;
  174. ct_labels = nf_ct_labels_find(ct);
  175. if (ct_labels)
  176. memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
  177. else
  178. memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
  179. }
  180. static int tcf_ct_flow_table_add_action_nat(struct net *net,
  181. struct nf_conn *ct,
  182. enum ip_conntrack_dir dir,
  183. struct flow_action *action)
  184. {
  185. const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
  186. struct nf_conntrack_tuple target;
  187. if (!(ct->status & IPS_NAT_MASK))
  188. return 0;
  189. nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
  190. switch (tuple->src.l3num) {
  191. case NFPROTO_IPV4:
  192. tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
  193. action);
  194. break;
  195. case NFPROTO_IPV6:
  196. tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
  197. action);
  198. break;
  199. default:
  200. return -EOPNOTSUPP;
  201. }
  202. switch (nf_ct_protonum(ct)) {
  203. case IPPROTO_TCP:
  204. tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
  205. break;
  206. case IPPROTO_UDP:
  207. tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
  208. break;
  209. default:
  210. return -EOPNOTSUPP;
  211. }
  212. return 0;
  213. }
  214. static int tcf_ct_flow_table_fill_actions(struct net *net,
  215. struct flow_offload *flow,
  216. enum flow_offload_tuple_dir tdir,
  217. struct nf_flow_rule *flow_rule)
  218. {
  219. struct flow_action *action = &flow_rule->rule->action;
  220. int num_entries = action->num_entries;
  221. struct nf_conn *ct = flow->ct;
  222. enum ip_conntrack_info ctinfo;
  223. enum ip_conntrack_dir dir;
  224. int i, err;
  225. switch (tdir) {
  226. case FLOW_OFFLOAD_DIR_ORIGINAL:
  227. dir = IP_CT_DIR_ORIGINAL;
  228. ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
  229. IP_CT_ESTABLISHED : IP_CT_NEW;
  230. if (ctinfo == IP_CT_ESTABLISHED)
  231. set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
  232. break;
  233. case FLOW_OFFLOAD_DIR_REPLY:
  234. dir = IP_CT_DIR_REPLY;
  235. ctinfo = IP_CT_ESTABLISHED_REPLY;
  236. break;
  237. default:
  238. return -EOPNOTSUPP;
  239. }
  240. err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
  241. if (err)
  242. goto err_nat;
  243. tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
  244. return 0;
  245. err_nat:
  246. /* Clear filled actions */
  247. for (i = num_entries; i < action->num_entries; i++)
  248. memset(&action->entries[i], 0, sizeof(action->entries[i]));
  249. action->num_entries = num_entries;
  250. return err;
  251. }
  252. static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
  253. {
  254. return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
  255. test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) &&
  256. !test_bit(NF_FLOW_HW_PENDING, &flow->flags) &&
  257. !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
  258. }
  259. static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);
  260. static void tcf_ct_nf_get(struct nf_flowtable *ft)
  261. {
  262. struct tcf_ct_flow_table *ct_ft =
  263. container_of(ft, struct tcf_ct_flow_table, nf_ft);
  264. tcf_ct_flow_table_get_ref(ct_ft);
  265. }
  266. static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);
  267. static void tcf_ct_nf_put(struct nf_flowtable *ft)
  268. {
  269. struct tcf_ct_flow_table *ct_ft =
  270. container_of(ft, struct tcf_ct_flow_table, nf_ft);
  271. tcf_ct_flow_table_put(ct_ft);
  272. }
  /* Flowtable type installed on every table created by
   * tcf_ct_flow_table_get(); it routes the gc/action/get/put callbacks to
   * the tcf_ct_* helpers in this file.
   */
  273. static struct nf_flowtable_type flowtable_ct = {
  274. .gc = tcf_ct_flow_is_outdated,
  275. .action = tcf_ct_flow_table_fill_actions,
  276. .get = tcf_ct_nf_get,
  277. .put = tcf_ct_nf_put,
  278. .owner = THIS_MODULE,
  279. };
  280. static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
  281. {
  282. struct zones_ht_key key = { .net = net, .zone = params->zone };
  283. struct tcf_ct_flow_table *ct_ft;
  284. int err = -ENOMEM;
  285. mutex_lock(&zones_mutex);
  286. ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params);
  287. if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
  288. goto out_unlock;
  289. ct_ft = kzalloc_obj(*ct_ft);
  290. if (!ct_ft)
  291. goto err_alloc;
  292. refcount_set(&ct_ft->ref, 1);
  293. ct_ft->key = key;
  294. err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
  295. if (err)
  296. goto err_insert;
  297. ct_ft->nf_ft.type = &flowtable_ct;
  298. ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
  299. NF_FLOWTABLE_COUNTER;
  300. err = nf_flow_table_init(&ct_ft->nf_ft);
  301. if (err)
  302. goto err_init;
  303. write_pnet(&ct_ft->nf_ft.net, net);
  304. __module_get(THIS_MODULE);
  305. out_unlock:
  306. params->ct_ft = ct_ft;
  307. params->nf_ft = &ct_ft->nf_ft;
  308. mutex_unlock(&zones_mutex);
  309. return 0;
  310. err_init:
  311. rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
  312. err_insert:
  313. kfree(ct_ft);
  314. err_alloc:
  315. mutex_unlock(&zones_mutex);
  316. return err;
  317. }
  318. static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
  319. {
  320. refcount_inc(&ct_ft->ref);
  321. }
  322. static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
  323. {
  324. struct tcf_ct_flow_table *ct_ft;
  325. struct flow_block *block;
  326. ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
  327. rwork);
  328. nf_flow_table_free(&ct_ft->nf_ft);
  329. block = &ct_ft->nf_ft.flow_block;
  330. down_write(&ct_ft->nf_ft.flow_block_lock);
  331. WARN_ON(!list_empty(&block->cb_list));
  332. up_write(&ct_ft->nf_ft.flow_block_lock);
  333. kfree(ct_ft);
  334. module_put(THIS_MODULE);
  335. }
  336. static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
  337. {
  338. if (refcount_dec_and_test(&ct_ft->ref)) {
  339. rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
  340. INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
  341. queue_rcu_work(act_ct_wq, &ct_ft->rwork);
  342. }
  343. }
  344. static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
  345. struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
  346. {
  347. entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
  348. entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
  349. }
  350. static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry)
  351. {
  352. struct nf_conn_act_ct_ext *act_ct_ext;
  353. act_ct_ext = nf_conn_act_ct_ext_find(entry->ct);
  354. if (act_ct_ext) {
  355. tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
  356. tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
  357. }
  358. }
  359. static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
  360. struct nf_conn *ct,
  361. bool tcp, bool bidirectional)
  362. {
  363. struct nf_conn_act_ct_ext *act_ct_ext;
  364. struct flow_offload *entry;
  365. int err;
  366. if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
  367. return;
  368. entry = flow_offload_alloc(ct);
  369. if (!entry) {
  370. WARN_ON_ONCE(1);
  371. goto err_alloc;
  372. }
  373. if (tcp) {
  374. ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
  375. ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
  376. }
  377. if (bidirectional)
  378. __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
  379. act_ct_ext = nf_conn_act_ct_ext_find(ct);
  380. if (act_ct_ext) {
  381. tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
  382. tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
  383. }
  384. err = flow_offload_add(&ct_ft->nf_ft, entry);
  385. if (err)
  386. goto err_add;
  387. return;
  388. err_add:
  389. flow_offload_free(entry);
  390. err_alloc:
  391. clear_bit(IPS_OFFLOAD_BIT, &ct->status);
  392. }
  393. static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
  394. struct nf_conn *ct,
  395. enum ip_conntrack_info ctinfo)
  396. {
  397. bool tcp = false, bidirectional = true;
  398. switch (nf_ct_protonum(ct)) {
  399. case IPPROTO_TCP:
  400. if ((ctinfo != IP_CT_ESTABLISHED &&
  401. ctinfo != IP_CT_ESTABLISHED_REPLY) ||
  402. !test_bit(IPS_ASSURED_BIT, &ct->status) ||
  403. ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
  404. return;
  405. tcp = true;
  406. break;
  407. case IPPROTO_UDP:
  408. if (!nf_ct_is_confirmed(ct))
  409. return;
  410. if (!test_bit(IPS_ASSURED_BIT, &ct->status))
  411. bidirectional = false;
  412. break;
  413. #ifdef CONFIG_NF_CT_PROTO_GRE
  414. case IPPROTO_GRE: {
  415. struct nf_conntrack_tuple *tuple;
  416. if ((ctinfo != IP_CT_ESTABLISHED &&
  417. ctinfo != IP_CT_ESTABLISHED_REPLY) ||
  418. !test_bit(IPS_ASSURED_BIT, &ct->status) ||
  419. ct->status & IPS_NAT_MASK)
  420. return;
  421. tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
  422. /* No support for GRE v1 */
  423. if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
  424. return;
  425. break;
  426. }
  427. #endif
  428. default:
  429. return;
  430. }
  431. if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
  432. ct->status & IPS_SEQ_ADJUST)
  433. return;
  434. tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
  435. }
  436. static bool
  437. tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
  438. struct flow_offload_tuple *tuple,
  439. struct tcphdr **tcph)
  440. {
  441. struct flow_ports *ports;
  442. unsigned int thoff;
  443. struct iphdr *iph;
  444. size_t hdrsize;
  445. u8 ipproto;
  446. if (!pskb_network_may_pull(skb, sizeof(*iph)))
  447. return false;
  448. iph = ip_hdr(skb);
  449. thoff = iph->ihl * 4;
  450. if (ip_is_fragment(iph) ||
  451. unlikely(thoff != sizeof(struct iphdr)))
  452. return false;
  453. ipproto = iph->protocol;
  454. switch (ipproto) {
  455. case IPPROTO_TCP:
  456. hdrsize = sizeof(struct tcphdr);
  457. break;
  458. case IPPROTO_UDP:
  459. hdrsize = sizeof(*ports);
  460. break;
  461. #ifdef CONFIG_NF_CT_PROTO_GRE
  462. case IPPROTO_GRE:
  463. hdrsize = sizeof(struct gre_base_hdr);
  464. break;
  465. #endif
  466. default:
  467. return false;
  468. }
  469. if (iph->ttl <= 1)
  470. return false;
  471. if (!pskb_network_may_pull(skb, thoff + hdrsize))
  472. return false;
  473. switch (ipproto) {
  474. case IPPROTO_TCP:
  475. *tcph = (void *)(skb_network_header(skb) + thoff);
  476. fallthrough;
  477. case IPPROTO_UDP:
  478. ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
  479. tuple->src_port = ports->source;
  480. tuple->dst_port = ports->dest;
  481. break;
  482. case IPPROTO_GRE: {
  483. struct gre_base_hdr *greh;
  484. greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
  485. if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
  486. return false;
  487. break;
  488. }
  489. }
  490. iph = ip_hdr(skb);
  491. tuple->src_v4.s_addr = iph->saddr;
  492. tuple->dst_v4.s_addr = iph->daddr;
  493. tuple->l3proto = AF_INET;
  494. tuple->l4proto = ipproto;
  495. return true;
  496. }
  497. static bool
  498. tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
  499. struct flow_offload_tuple *tuple,
  500. struct tcphdr **tcph)
  501. {
  502. struct flow_ports *ports;
  503. struct ipv6hdr *ip6h;
  504. unsigned int thoff;
  505. size_t hdrsize;
  506. u8 nexthdr;
  507. if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
  508. return false;
  509. ip6h = ipv6_hdr(skb);
  510. thoff = sizeof(*ip6h);
  511. nexthdr = ip6h->nexthdr;
  512. switch (nexthdr) {
  513. case IPPROTO_TCP:
  514. hdrsize = sizeof(struct tcphdr);
  515. break;
  516. case IPPROTO_UDP:
  517. hdrsize = sizeof(*ports);
  518. break;
  519. #ifdef CONFIG_NF_CT_PROTO_GRE
  520. case IPPROTO_GRE:
  521. hdrsize = sizeof(struct gre_base_hdr);
  522. break;
  523. #endif
  524. default:
  525. return false;
  526. }
  527. if (ip6h->hop_limit <= 1)
  528. return false;
  529. if (!pskb_network_may_pull(skb, thoff + hdrsize))
  530. return false;
  531. switch (nexthdr) {
  532. case IPPROTO_TCP:
  533. *tcph = (void *)(skb_network_header(skb) + thoff);
  534. fallthrough;
  535. case IPPROTO_UDP:
  536. ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
  537. tuple->src_port = ports->source;
  538. tuple->dst_port = ports->dest;
  539. break;
  540. case IPPROTO_GRE: {
  541. struct gre_base_hdr *greh;
  542. greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
  543. if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
  544. return false;
  545. break;
  546. }
  547. }
  548. ip6h = ipv6_hdr(skb);
  549. tuple->src_v6 = ip6h->saddr;
  550. tuple->dst_v6 = ip6h->daddr;
  551. tuple->l3proto = AF_INET6;
  552. tuple->l4proto = nexthdr;
  553. return true;
  554. }
  555. static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
  556. struct sk_buff *skb,
  557. u8 family)
  558. {
  559. struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
  560. struct flow_offload_tuple_rhash *tuplehash;
  561. struct flow_offload_tuple tuple = {};
  562. enum ip_conntrack_info ctinfo;
  563. struct tcphdr *tcph = NULL;
  564. bool force_refresh = false;
  565. struct flow_offload *flow;
  566. struct nf_conn *ct;
  567. u8 dir;
  568. switch (family) {
  569. case NFPROTO_IPV4:
  570. if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
  571. return false;
  572. break;
  573. case NFPROTO_IPV6:
  574. if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
  575. return false;
  576. break;
  577. default:
  578. return false;
  579. }
  580. tuplehash = flow_offload_lookup(nf_ft, &tuple);
  581. if (!tuplehash)
  582. return false;
  583. dir = tuplehash->tuple.dir;
  584. flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
  585. ct = flow->ct;
  586. if (dir == FLOW_OFFLOAD_DIR_REPLY &&
  587. !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
  588. /* Only offload reply direction after connection became
  589. * assured.
  590. */
  591. if (test_bit(IPS_ASSURED_BIT, &ct->status))
  592. set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
  593. else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
  594. /* If flow_table flow has already been updated to the
  595. * established state, then don't refresh.
  596. */
  597. return false;
  598. force_refresh = true;
  599. }
  600. if (tcph && (unlikely(tcph->fin || tcph->rst))) {
  601. flow_offload_teardown(flow);
  602. return false;
  603. }
  604. if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
  605. ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
  606. IP_CT_ESTABLISHED : IP_CT_NEW;
  607. else
  608. ctinfo = IP_CT_ESTABLISHED_REPLY;
  609. nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
  610. tcf_ct_flow_ct_ext_ifidx_update(flow);
  611. flow_offload_refresh(nf_ft, flow, force_refresh);
  612. if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
  613. /* Process this flow in SW to allow promoting to ASSURED */
  614. return false;
  615. }
  616. nf_conntrack_get(&ct->ct_general);
  617. nf_ct_set(skb, ct, ctinfo);
  618. if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
  619. nf_ct_acct_update(ct, dir, skb->len);
  620. return true;
  621. }
  622. static int tcf_ct_flow_tables_init(void)
  623. {
  624. return rhashtable_init(&zones_ht, &zones_params);
  625. }
  626. static void tcf_ct_flow_tables_uninit(void)
  627. {
  628. rhashtable_destroy(&zones_ht);
  629. }
  /* Declared here without an initializer.
   * NOTE(review): the initialized definition is presumably further down the
   * file, outside this view - confirm.
   */
  630. static struct tc_action_ops act_ct_ops;
  /* Wrapper around struct tc_action_net for this action. */
  631. struct tc_ct_action_net {
  632. struct tc_action_net tn; /* Must be first */
  633. };
  634. /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
  635. static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
  636. struct tcf_ct_params *p)
  637. {
  638. enum ip_conntrack_info ctinfo;
  639. struct nf_conn *ct;
  640. ct = nf_ct_get(skb, &ctinfo);
  641. if (!ct)
  642. return false;
  643. if (!net_eq(net, read_pnet(&ct->ct_net)))
  644. goto drop_ct;
  645. if (nf_ct_zone(ct)->id != p->zone)
  646. goto drop_ct;
  647. if (p->helper) {
  648. struct nf_conn_help *help;
  649. help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
  650. if (help && rcu_access_pointer(help->helper) != p->helper)
  651. goto drop_ct;
  652. }
  653. /* Force conntrack entry direction. */
  654. if ((p->ct_action & TCA_CT_ACT_FORCE) &&
  655. CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
  656. if (nf_ct_is_confirmed(ct))
  657. nf_ct_kill(ct);
  658. goto drop_ct;
  659. }
  660. return true;
  661. drop_ct:
  662. nf_ct_put(ct);
  663. nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
  664. return false;
  665. }
  666. static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
  667. {
  668. u8 family = NFPROTO_UNSPEC;
  669. switch (skb_protocol(skb, true)) {
  670. case htons(ETH_P_IP):
  671. family = NFPROTO_IPV4;
  672. break;
  673. case htons(ETH_P_IPV6):
  674. family = NFPROTO_IPV6;
  675. break;
  676. default:
  677. break;
  678. }
  679. return family;
  680. }
  681. static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
  682. {
  683. unsigned int len;
  684. len = skb_network_offset(skb) + sizeof(struct iphdr);
  685. if (unlikely(skb->len < len))
  686. return -EINVAL;
  687. if (unlikely(!pskb_may_pull(skb, len)))
  688. return -ENOMEM;
  689. *frag = ip_is_fragment(ip_hdr(skb));
  690. return 0;
  691. }
  692. static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
  693. {
  694. unsigned int flags = 0, len, payload_ofs = 0;
  695. unsigned short frag_off;
  696. int nexthdr;
  697. len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
  698. if (unlikely(skb->len < len))
  699. return -EINVAL;
  700. if (unlikely(!pskb_may_pull(skb, len)))
  701. return -ENOMEM;
  702. nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
  703. if (unlikely(nexthdr < 0))
  704. return -EPROTO;
  705. *frag = flags & IP6_FH_F_FRAG;
  706. return 0;
  707. }
  708. static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
  709. u8 family, u16 zone, bool *defrag)
  710. {
  711. enum ip_conntrack_info ctinfo;
  712. struct nf_conn *ct;
  713. int err = 0;
  714. bool frag;
  715. u8 proto;
  716. u16 mru;
  717. /* Previously seen (loopback)? Ignore. */
  718. ct = nf_ct_get(skb, &ctinfo);
  719. if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
  720. return 0;
  721. if (family == NFPROTO_IPV4)
  722. err = tcf_ct_ipv4_is_fragment(skb, &frag);
  723. else
  724. err = tcf_ct_ipv6_is_fragment(skb, &frag);
  725. if (err || !frag)
  726. return err;
  727. err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
  728. if (err)
  729. return err;
  730. *defrag = true;
  731. tc_skb_cb(skb)->mru = mru;
  732. return 0;
  733. }
  734. static void tcf_ct_params_free(struct tcf_ct_params *params)
  735. {
  736. if (params->helper) {
  737. #if IS_ENABLED(CONFIG_NF_NAT)
  738. if (params->ct_action & TCA_CT_ACT_NAT)
  739. nf_nat_helper_put(params->helper);
  740. #endif
  741. nf_conntrack_helper_put(params->helper);
  742. }
  743. if (params->ct_ft)
  744. tcf_ct_flow_table_put(params->ct_ft);
  745. if (params->tmpl) {
  746. if (params->put_labels)
  747. nf_connlabels_put(nf_ct_net(params->tmpl));
  748. nf_ct_put(params->tmpl);
  749. }
  750. kfree(params);
  751. }
  752. static void tcf_ct_params_free_rcu(struct rcu_head *head)
  753. {
  754. struct tcf_ct_params *params;
  755. params = container_of(head, struct tcf_ct_params, rcu);
  756. tcf_ct_params_free(params);
  757. }
  758. static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
  759. {
  760. #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
  761. u32 new_mark;
  762. if (!mask)
  763. return;
  764. new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
  765. if (READ_ONCE(ct->mark) != new_mark) {
  766. WRITE_ONCE(ct->mark, new_mark);
  767. if (nf_ct_is_confirmed(ct))
  768. nf_conntrack_event_cache(IPCT_MARK, ct);
  769. }
  770. #endif
  771. }
  772. static void tcf_ct_act_set_labels(struct nf_conn *ct,
  773. u32 *labels,
  774. u32 *labels_m)
  775. {
  776. #if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
  777. size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);
  778. if (!memchr_inv(labels_m, 0, labels_sz))
  779. return;
  780. nf_connlabels_replace(ct, labels, labels_m, 4);
  781. #endif
  782. }
  783. static int tcf_ct_act_nat(struct sk_buff *skb,
  784. struct nf_conn *ct,
  785. enum ip_conntrack_info ctinfo,
  786. int ct_action,
  787. struct nf_nat_range2 *range,
  788. bool commit)
  789. {
  790. #if IS_ENABLED(CONFIG_NF_NAT)
  791. int err, action = 0;
  792. if (!(ct_action & TCA_CT_ACT_NAT))
  793. return NF_ACCEPT;
  794. if (ct_action & TCA_CT_ACT_NAT_SRC)
  795. action |= BIT(NF_NAT_MANIP_SRC);
  796. if (ct_action & TCA_CT_ACT_NAT_DST)
  797. action |= BIT(NF_NAT_MANIP_DST);
  798. err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit);
  799. if (err != NF_ACCEPT)
  800. return err & NF_VERDICT_MASK;
  801. if (action & BIT(NF_NAT_MANIP_SRC))
  802. qdisc_skb_cb(skb)->post_ct_snat = 1;
  803. if (action & BIT(NF_NAT_MANIP_DST))
  804. qdisc_skb_cb(skb)->post_ct_dnat = 1;
  805. return err;
  806. #else
  807. return NF_ACCEPT;
  808. #endif
  809. }
  810. TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
  811. struct tcf_result *res)
  812. {
  813. struct net *net = dev_net(skb->dev);
  814. enum ip_conntrack_info ctinfo;
  815. struct tcf_ct *c = to_ct(a);
  816. struct nf_conn *tmpl = NULL;
  817. struct nf_hook_state state;
  818. bool cached, commit, clear;
  819. int nh_ofs, err, retval;
  820. struct tcf_ct_params *p;
  821. bool add_helper = false;
  822. bool skip_add = false;
  823. bool defrag = false;
  824. struct nf_conn *ct;
  825. u8 family;
  826. p = rcu_dereference_bh(c->params);
  827. retval = p->action;
  828. commit = p->ct_action & TCA_CT_ACT_COMMIT;
  829. clear = p->ct_action & TCA_CT_ACT_CLEAR;
  830. tmpl = p->tmpl;
  831. tcf_lastuse_update(&c->tcf_tm);
  832. tcf_action_update_bstats(&c->common, skb);
  833. if (clear) {
  834. qdisc_skb_cb(skb)->post_ct = false;
  835. ct = nf_ct_get(skb, &ctinfo);
  836. if (ct) {
  837. nf_ct_put(ct);
  838. nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
  839. }
  840. goto out_clear;
  841. }
  842. family = tcf_ct_skb_nf_family(skb);
  843. if (family == NFPROTO_UNSPEC)
  844. goto drop;
  845. /* The conntrack module expects to be working at L3.
  846. * We also try to pull the IPv4/6 header to linear area
  847. */
  848. nh_ofs = skb_network_offset(skb);
  849. skb_pull_rcsum(skb, nh_ofs);
  850. err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
  851. if (err)
  852. goto out_frag;
  853. err = nf_ct_skb_network_trim(skb, family);
  854. if (err)
  855. goto drop;
  856. /* If we are recirculating packets to match on ct fields and
  857. * committing with a separate ct action, then we don't need to
  858. * actually run the packet through conntrack twice unless it's for a
  859. * different zone.
  860. */
  861. cached = tcf_ct_skb_nfct_cached(net, skb, p);
  862. if (!cached) {
  863. if (tcf_ct_flow_table_lookup(p, skb, family)) {
  864. skip_add = true;
  865. goto do_nat;
  866. }
  867. /* Associate skb with specified zone. */
  868. if (tmpl) {
  869. nf_conntrack_put(skb_nfct(skb));
  870. nf_conntrack_get(&tmpl->ct_general);
  871. nf_ct_set(skb, tmpl, IP_CT_NEW);
  872. }
  873. state.hook = NF_INET_PRE_ROUTING;
  874. state.net = net;
  875. state.pf = family;
  876. err = nf_conntrack_in(skb, &state);
  877. if (err != NF_ACCEPT)
  878. goto nf_error;
  879. }
  880. do_nat:
  881. ct = nf_ct_get(skb, &ctinfo);
  882. if (!ct)
  883. goto out_push;
  884. nf_ct_deliver_cached_events(ct);
  885. nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
  886. err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
  887. if (err != NF_ACCEPT)
  888. goto nf_error;
  889. if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) {
  890. err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC);
  891. if (err)
  892. goto drop;
  893. add_helper = true;
  894. if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) {
  895. if (!nfct_seqadj_ext_add(ct))
  896. goto drop;
  897. }
  898. }
  899. if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) {
  900. err = nf_ct_helper(skb, ct, ctinfo, family);
  901. if (err != NF_ACCEPT)
  902. goto nf_error;
  903. }
  904. if (commit) {
  905. tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
  906. tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);
  907. if (!nf_ct_is_confirmed(ct))
  908. nf_conn_act_ct_ext_add(skb, ct, ctinfo);
  909. /* This will take care of sending queued events
  910. * even if the connection is already confirmed.
  911. */
  912. err = nf_conntrack_confirm(skb);
  913. if (err != NF_ACCEPT)
  914. goto nf_error;
  915. /* The ct may be dropped if a clash has been resolved,
  916. * so it's necessary to retrieve it from skb again to
  917. * prevent UAF.
  918. */
  919. ct = nf_ct_get(skb, &ctinfo);
  920. if (!ct)
  921. skip_add = true;
  922. }
  923. if (!skip_add)
  924. tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
  925. out_push:
  926. skb_push_rcsum(skb, nh_ofs);
  927. qdisc_skb_cb(skb)->post_ct = true;
  928. tc_skb_cb(skb)->zone = p->zone;
  929. out_clear:
  930. if (defrag)
  931. qdisc_skb_cb(skb)->pkt_len = skb->len;
  932. return retval;
  933. out_frag:
  934. if (err != -EINPROGRESS)
  935. tcf_action_inc_drop_qstats(&c->common);
  936. return TC_ACT_CONSUMED;
  937. drop:
  938. tcf_action_inc_drop_qstats(&c->common);
  939. return TC_ACT_SHOT;
  940. nf_error:
  941. /* some verdicts store extra data in upper bits, such
  942. * as errno or queue number.
  943. */
  944. switch (err & NF_VERDICT_MASK) {
  945. case NF_DROP:
  946. goto drop;
  947. case NF_STOLEN:
  948. tcf_action_inc_drop_qstats(&c->common);
  949. return TC_ACT_CONSUMED;
  950. default:
  951. DEBUG_NET_WARN_ON_ONCE(1);
  952. goto drop;
  953. }
  954. }
  955. static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
  956. [TCA_CT_ACTION] = { .type = NLA_U16 },
  957. [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
  958. [TCA_CT_ZONE] = { .type = NLA_U16 },
  959. [TCA_CT_MARK] = { .type = NLA_U32 },
  960. [TCA_CT_MARK_MASK] = { .type = NLA_U32 },
  961. [TCA_CT_LABELS] = { .type = NLA_BINARY,
  962. .len = 128 / BITS_PER_BYTE },
  963. [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
  964. .len = 128 / BITS_PER_BYTE },
  965. [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
  966. [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
  967. [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
  968. [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
  969. [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
  970. [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
  971. [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN },
  972. [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 },
  973. [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 },
  974. };
  975. static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
  976. struct tc_ct *parm,
  977. struct nlattr **tb,
  978. struct netlink_ext_ack *extack)
  979. {
  980. struct nf_nat_range2 *range;
  981. if (!(p->ct_action & TCA_CT_ACT_NAT))
  982. return 0;
  983. if (!IS_ENABLED(CONFIG_NF_NAT)) {
  984. NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
  985. return -EOPNOTSUPP;
  986. }
  987. if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
  988. return 0;
  989. if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
  990. (p->ct_action & TCA_CT_ACT_NAT_DST)) {
  991. NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
  992. return -EOPNOTSUPP;
  993. }
  994. range = &p->range;
  995. if (tb[TCA_CT_NAT_IPV4_MIN]) {
  996. struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];
  997. p->ipv4_range = true;
  998. range->flags |= NF_NAT_RANGE_MAP_IPS;
  999. range->min_addr.ip =
  1000. nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);
  1001. range->max_addr.ip =
  1002. nla_get_in_addr_default(max_attr, range->min_addr.ip);
  1003. } else if (tb[TCA_CT_NAT_IPV6_MIN]) {
  1004. struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];
  1005. p->ipv4_range = false;
  1006. range->flags |= NF_NAT_RANGE_MAP_IPS;
  1007. range->min_addr.in6 =
  1008. nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);
  1009. range->max_addr.in6 = max_attr ?
  1010. nla_get_in6_addr(max_attr) :
  1011. range->min_addr.in6;
  1012. }
  1013. if (tb[TCA_CT_NAT_PORT_MIN]) {
  1014. range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
  1015. range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);
  1016. range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
  1017. nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
  1018. range->min_proto.all;
  1019. }
  1020. return 0;
  1021. }
  1022. static void tcf_ct_set_key_val(struct nlattr **tb,
  1023. void *val, int val_type,
  1024. void *mask, int mask_type,
  1025. int len)
  1026. {
  1027. if (!tb[val_type])
  1028. return;
  1029. nla_memcpy(val, tb[val_type], len);
  1030. if (!mask)
  1031. return;
  1032. if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
  1033. memset(mask, 0xff, len);
  1034. else
  1035. nla_memcpy(mask, tb[mask_type], len);
  1036. }
  1037. static int tcf_ct_fill_params(struct net *net,
  1038. struct tcf_ct_params *p,
  1039. struct tc_ct *parm,
  1040. struct nlattr **tb,
  1041. struct netlink_ext_ack *extack)
  1042. {
  1043. struct nf_conntrack_zone zone;
  1044. int err, family, proto, len;
  1045. bool put_labels = false;
  1046. struct nf_conn *tmpl;
  1047. char *name;
  1048. p->zone = NF_CT_DEFAULT_ZONE_ID;
  1049. tcf_ct_set_key_val(tb,
  1050. &p->ct_action, TCA_CT_ACTION,
  1051. NULL, TCA_CT_UNSPEC,
  1052. sizeof(p->ct_action));
  1053. if (p->ct_action & TCA_CT_ACT_CLEAR)
  1054. return 0;
  1055. err = tcf_ct_fill_params_nat(p, parm, tb, extack);
  1056. if (err)
  1057. return err;
  1058. if (tb[TCA_CT_MARK]) {
  1059. if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
  1060. NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
  1061. return -EOPNOTSUPP;
  1062. }
  1063. tcf_ct_set_key_val(tb,
  1064. &p->mark, TCA_CT_MARK,
  1065. &p->mark_mask, TCA_CT_MARK_MASK,
  1066. sizeof(p->mark));
  1067. }
  1068. if (tb[TCA_CT_LABELS]) {
  1069. unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
  1070. if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
  1071. NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
  1072. return -EOPNOTSUPP;
  1073. }
  1074. if (nf_connlabels_get(net, n_bits - 1)) {
  1075. NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
  1076. return -EOPNOTSUPP;
  1077. } else {
  1078. put_labels = true;
  1079. }
  1080. tcf_ct_set_key_val(tb,
  1081. p->labels, TCA_CT_LABELS,
  1082. p->labels_mask, TCA_CT_LABELS_MASK,
  1083. sizeof(p->labels));
  1084. }
  1085. if (tb[TCA_CT_ZONE]) {
  1086. if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
  1087. NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
  1088. return -EOPNOTSUPP;
  1089. }
  1090. tcf_ct_set_key_val(tb,
  1091. &p->zone, TCA_CT_ZONE,
  1092. NULL, TCA_CT_UNSPEC,
  1093. sizeof(p->zone));
  1094. }
  1095. nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
  1096. tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
  1097. if (!tmpl) {
  1098. NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
  1099. return -ENOMEM;
  1100. }
  1101. p->tmpl = tmpl;
  1102. if (tb[TCA_CT_HELPER_NAME]) {
  1103. name = nla_data(tb[TCA_CT_HELPER_NAME]);
  1104. len = nla_len(tb[TCA_CT_HELPER_NAME]);
  1105. if (len > 16 || name[len - 1] != '\0') {
  1106. NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name.");
  1107. err = -EINVAL;
  1108. goto err;
  1109. }
  1110. family = nla_get_u8_default(tb[TCA_CT_HELPER_FAMILY], AF_INET);
  1111. proto = nla_get_u8_default(tb[TCA_CT_HELPER_PROTO],
  1112. IPPROTO_TCP);
  1113. err = nf_ct_add_helper(tmpl, name, family, proto,
  1114. p->ct_action & TCA_CT_ACT_NAT, &p->helper);
  1115. if (err) {
  1116. NL_SET_ERR_MSG_MOD(extack, "Failed to add helper");
  1117. goto err;
  1118. }
  1119. }
  1120. p->put_labels = put_labels;
  1121. if (p->ct_action & TCA_CT_ACT_COMMIT)
  1122. __set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
  1123. return 0;
  1124. err:
  1125. if (put_labels)
  1126. nf_connlabels_put(net);
  1127. nf_ct_put(p->tmpl);
  1128. p->tmpl = NULL;
  1129. return err;
  1130. }
  1131. static int tcf_ct_init(struct net *net, struct nlattr *nla,
  1132. struct nlattr *est, struct tc_action **a,
  1133. struct tcf_proto *tp, u32 flags,
  1134. struct netlink_ext_ack *extack)
  1135. {
  1136. struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
  1137. bool bind = flags & TCA_ACT_FLAGS_BIND;
  1138. struct tcf_ct_params *params = NULL;
  1139. struct nlattr *tb[TCA_CT_MAX + 1];
  1140. struct tcf_chain *goto_ch = NULL;
  1141. struct tc_ct *parm;
  1142. struct tcf_ct *c;
  1143. int err, res = 0;
  1144. u32 index;
  1145. if (!nla) {
  1146. NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
  1147. return -EINVAL;
  1148. }
  1149. if (bind && !(flags & TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT)) {
  1150. NL_SET_ERR_MSG_MOD(extack,
  1151. "Attaching ct to a non ingress/clsact qdisc is unsupported");
  1152. return -EOPNOTSUPP;
  1153. }
  1154. err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
  1155. if (err < 0)
  1156. return err;
  1157. if (!tb[TCA_CT_PARMS]) {
  1158. NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
  1159. return -EINVAL;
  1160. }
  1161. parm = nla_data(tb[TCA_CT_PARMS]);
  1162. index = parm->index;
  1163. err = tcf_idr_check_alloc(tn, &index, a, bind);
  1164. if (err < 0)
  1165. return err;
  1166. if (!err) {
  1167. err = tcf_idr_create_from_flags(tn, index, est, a,
  1168. &act_ct_ops, bind, flags);
  1169. if (err) {
  1170. tcf_idr_cleanup(tn, index);
  1171. return err;
  1172. }
  1173. res = ACT_P_CREATED;
  1174. } else {
  1175. if (bind)
  1176. return ACT_P_BOUND;
  1177. if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
  1178. tcf_idr_release(*a, bind);
  1179. return -EEXIST;
  1180. }
  1181. }
  1182. err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
  1183. if (err < 0)
  1184. goto cleanup;
  1185. c = to_ct(*a);
  1186. params = kzalloc_obj(*params);
  1187. if (unlikely(!params)) {
  1188. err = -ENOMEM;
  1189. goto cleanup;
  1190. }
  1191. err = tcf_ct_fill_params(net, params, parm, tb, extack);
  1192. if (err)
  1193. goto cleanup;
  1194. err = tcf_ct_flow_table_get(net, params);
  1195. if (err)
  1196. goto cleanup;
  1197. params->action = parm->action;
  1198. spin_lock_bh(&c->tcf_lock);
  1199. goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
  1200. params = rcu_replace_pointer(c->params, params,
  1201. lockdep_is_held(&c->tcf_lock));
  1202. spin_unlock_bh(&c->tcf_lock);
  1203. if (goto_ch)
  1204. tcf_chain_put_by_act(goto_ch);
  1205. if (params)
  1206. call_rcu(&params->rcu, tcf_ct_params_free_rcu);
  1207. return res;
  1208. cleanup:
  1209. if (goto_ch)
  1210. tcf_chain_put_by_act(goto_ch);
  1211. if (params)
  1212. tcf_ct_params_free(params);
  1213. tcf_idr_release(*a, bind);
  1214. return err;
  1215. }
  1216. static void tcf_ct_cleanup(struct tc_action *a)
  1217. {
  1218. struct tcf_ct_params *params;
  1219. struct tcf_ct *c = to_ct(a);
  1220. params = rcu_dereference_protected(c->params, 1);
  1221. if (params)
  1222. call_rcu(&params->rcu, tcf_ct_params_free_rcu);
  1223. }
  1224. static int tcf_ct_dump_key_val(struct sk_buff *skb,
  1225. const void *val, int val_type,
  1226. const void *mask, int mask_type,
  1227. int len)
  1228. {
  1229. int err;
  1230. if (mask && !memchr_inv(mask, 0, len))
  1231. return 0;
  1232. err = nla_put(skb, val_type, len, val);
  1233. if (err)
  1234. return err;
  1235. if (mask_type != TCA_CT_UNSPEC) {
  1236. err = nla_put(skb, mask_type, len, mask);
  1237. if (err)
  1238. return err;
  1239. }
  1240. return 0;
  1241. }
  1242. static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p)
  1243. {
  1244. const struct nf_nat_range2 *range = &p->range;
  1245. if (!(p->ct_action & TCA_CT_ACT_NAT))
  1246. return 0;
  1247. if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
  1248. return 0;
  1249. if (range->flags & NF_NAT_RANGE_MAP_IPS) {
  1250. if (p->ipv4_range) {
  1251. if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
  1252. range->min_addr.ip))
  1253. return -1;
  1254. if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
  1255. range->max_addr.ip))
  1256. return -1;
  1257. } else {
  1258. if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
  1259. &range->min_addr.in6))
  1260. return -1;
  1261. if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
  1262. &range->max_addr.in6))
  1263. return -1;
  1264. }
  1265. }
  1266. if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
  1267. if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
  1268. range->min_proto.all))
  1269. return -1;
  1270. if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
  1271. range->max_proto.all))
  1272. return -1;
  1273. }
  1274. return 0;
  1275. }
  1276. static int tcf_ct_dump_helper(struct sk_buff *skb,
  1277. const struct nf_conntrack_helper *helper)
  1278. {
  1279. if (!helper)
  1280. return 0;
  1281. if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) ||
  1282. nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) ||
  1283. nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum))
  1284. return -1;
  1285. return 0;
  1286. }
  1287. static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
  1288. int bind, int ref)
  1289. {
  1290. unsigned char *b = skb_tail_pointer(skb);
  1291. const struct tcf_ct *c = to_ct(a);
  1292. const struct tcf_ct_params *p;
  1293. struct tc_ct opt = {
  1294. .index = c->tcf_index,
  1295. .refcnt = refcount_read(&c->tcf_refcnt) - ref,
  1296. .bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
  1297. };
  1298. struct tcf_t t;
  1299. rcu_read_lock();
  1300. p = rcu_dereference(c->params);
  1301. opt.action = p->action;
  1302. if (tcf_ct_dump_key_val(skb,
  1303. &p->ct_action, TCA_CT_ACTION,
  1304. NULL, TCA_CT_UNSPEC,
  1305. sizeof(p->ct_action)))
  1306. goto nla_put_failure;
  1307. if (p->ct_action & TCA_CT_ACT_CLEAR)
  1308. goto skip_dump;
  1309. if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
  1310. tcf_ct_dump_key_val(skb,
  1311. &p->mark, TCA_CT_MARK,
  1312. &p->mark_mask, TCA_CT_MARK_MASK,
  1313. sizeof(p->mark)))
  1314. goto nla_put_failure;
  1315. if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
  1316. tcf_ct_dump_key_val(skb,
  1317. p->labels, TCA_CT_LABELS,
  1318. p->labels_mask, TCA_CT_LABELS_MASK,
  1319. sizeof(p->labels)))
  1320. goto nla_put_failure;
  1321. if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
  1322. tcf_ct_dump_key_val(skb,
  1323. &p->zone, TCA_CT_ZONE,
  1324. NULL, TCA_CT_UNSPEC,
  1325. sizeof(p->zone)))
  1326. goto nla_put_failure;
  1327. if (tcf_ct_dump_nat(skb, p))
  1328. goto nla_put_failure;
  1329. if (tcf_ct_dump_helper(skb, p->helper))
  1330. goto nla_put_failure;
  1331. skip_dump:
  1332. if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
  1333. goto nla_put_failure;
  1334. tcf_tm_dump(&t, &c->tcf_tm);
  1335. if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
  1336. goto nla_put_failure;
  1337. rcu_read_unlock();
  1338. return skb->len;
  1339. nla_put_failure:
  1340. rcu_read_unlock();
  1341. nlmsg_trim(skb, b);
  1342. return -1;
  1343. }
  1344. static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
  1345. u64 drops, u64 lastuse, bool hw)
  1346. {
  1347. struct tcf_ct *c = to_ct(a);
  1348. tcf_action_update_stats(a, bytes, packets, drops, hw);
  1349. c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
  1350. }
  1351. static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
  1352. u32 *index_inc, bool bind,
  1353. struct netlink_ext_ack *extack)
  1354. {
  1355. if (bind) {
  1356. struct flow_action_entry *entry = entry_data;
  1357. if (tcf_ct_helper(act))
  1358. return -EOPNOTSUPP;
  1359. entry->id = FLOW_ACTION_CT;
  1360. entry->ct.action = tcf_ct_action(act);
  1361. entry->ct.zone = tcf_ct_zone(act);
  1362. entry->ct.flow_table = tcf_ct_ft(act);
  1363. *index_inc = 1;
  1364. } else {
  1365. struct flow_offload_action *fl_action = entry_data;
  1366. fl_action->id = FLOW_ACTION_CT;
  1367. }
  1368. return 0;
  1369. }
  1370. static struct tc_action_ops act_ct_ops = {
  1371. .kind = "ct",
  1372. .id = TCA_ID_CT,
  1373. .owner = THIS_MODULE,
  1374. .act = tcf_ct_act,
  1375. .dump = tcf_ct_dump,
  1376. .init = tcf_ct_init,
  1377. .cleanup = tcf_ct_cleanup,
  1378. .stats_update = tcf_stats_update,
  1379. .offload_act_setup = tcf_ct_offload_act_setup,
  1380. .size = sizeof(struct tcf_ct),
  1381. };
  1382. MODULE_ALIAS_NET_ACT("ct");
  1383. static __net_init int ct_init_net(struct net *net)
  1384. {
  1385. struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
  1386. return tc_action_net_init(net, &tn->tn, &act_ct_ops);
  1387. }
  1388. static void __net_exit ct_exit_net(struct list_head *net_list)
  1389. {
  1390. tc_action_net_exit(net_list, act_ct_ops.net_id);
  1391. }
  1392. static struct pernet_operations ct_net_ops = {
  1393. .init = ct_init_net,
  1394. .exit_batch = ct_exit_net,
  1395. .id = &act_ct_ops.net_id,
  1396. .size = sizeof(struct tc_ct_action_net),
  1397. };
  1398. static int __init ct_init_module(void)
  1399. {
  1400. int err;
  1401. act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
  1402. if (!act_ct_wq)
  1403. return -ENOMEM;
  1404. err = tcf_ct_flow_tables_init();
  1405. if (err)
  1406. goto err_tbl_init;
  1407. err = tcf_register_action(&act_ct_ops, &ct_net_ops);
  1408. if (err)
  1409. goto err_register;
  1410. static_branch_inc(&tcf_frag_xmit_count);
  1411. return 0;
  1412. err_register:
  1413. tcf_ct_flow_tables_uninit();
  1414. err_tbl_init:
  1415. destroy_workqueue(act_ct_wq);
  1416. return err;
  1417. }
  1418. static void __exit ct_cleanup_module(void)
  1419. {
  1420. static_branch_dec(&tcf_frag_xmit_count);
  1421. tcf_unregister_action(&act_ct_ops, &ct_net_ops);
  1422. tcf_ct_flow_tables_uninit();
  1423. destroy_workqueue(act_ct_wq);
  1424. }
  1425. module_init(ct_init_module);
  1426. module_exit(ct_cleanup_module);
  1427. MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
  1428. MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
  1429. MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
  1430. MODULE_DESCRIPTION("Connection tracking action");
  1431. MODULE_LICENSE("GPL v2");