ipoib_main.c 70 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803
  1. /*
  2. * Copyright (c) 2004 Topspin Communications. All rights reserved.
  3. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
  4. * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  5. *
  6. * This software is available to you under a choice of one of two
  7. * licenses. You may choose to be licensed under the terms of the GNU
  8. * General Public License (GPL) Version 2, available from the file
  9. * COPYING in the main directory of this source tree, or the
  10. * OpenIB.org BSD license below:
  11. *
  12. * Redistribution and use in source and binary forms, with or
  13. * without modification, are permitted provided that the following
  14. * conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above
  17. * copyright notice, this list of conditions and the following
  18. * disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials
  23. * provided with the distribution.
  24. *
  25. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  26. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  27. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  28. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  29. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  30. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  31. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  32. * SOFTWARE.
  33. */
  34. #include "ipoib.h"
  35. #include <linux/module.h>
  36. #include <linux/init.h>
  37. #include <linux/slab.h>
  38. #include <linux/kernel.h>
  39. #include <linux/vmalloc.h>
  40. #include <linux/if_arp.h> /* For ARPHRD_xxx */
  41. #include <linux/ip.h>
  42. #include <linux/in.h>
  43. #include <linux/jhash.h>
  44. #include <net/arp.h>
  45. #include <net/addrconf.h>
  46. #include <net/netdev_lock.h>
  47. #include <net/pkt_sched.h>
  48. #include <linux/inetdevice.h>
  49. #include <rdma/ib_cache.h>
  50. MODULE_AUTHOR("Roland Dreier");
  51. MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
  52. MODULE_LICENSE("Dual BSD/GPL");
  53. int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
  54. int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
  55. module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
  56. MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
  57. module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
  58. MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
  59. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  60. int ipoib_debug_level;
  61. module_param_named(debug_level, ipoib_debug_level, int, 0644);
  62. MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
  63. #endif
  64. struct ipoib_path_iter {
  65. struct net_device *dev;
  66. struct ipoib_path path;
  67. };
  68. static const u8 ipv4_bcast_addr[] = {
  69. 0x00, 0xff, 0xff, 0xff,
  70. 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
  71. 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
  72. };
  73. struct workqueue_struct *ipoib_workqueue;
  74. struct ib_sa_client ipoib_sa_client;
  75. static int ipoib_add_one(struct ib_device *device);
  76. static void ipoib_remove_one(struct ib_device *device, void *client_data);
  77. static void ipoib_neigh_reclaim(struct rcu_head *rp);
  78. static struct net_device *ipoib_get_net_dev_by_params(
  79. struct ib_device *dev, u32 port, u16 pkey,
  80. const union ib_gid *gid, const struct sockaddr *addr,
  81. void *client_data);
  82. static int ipoib_set_mac(struct net_device *dev, void *addr);
  83. static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
  84. int cmd);
  85. static struct ib_client ipoib_client = {
  86. .name = "ipoib",
  87. .add = ipoib_add_one,
  88. .remove = ipoib_remove_one,
  89. .get_net_dev_by_params = ipoib_get_net_dev_by_params,
  90. };
  91. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  92. static int ipoib_netdev_event(struct notifier_block *this,
  93. unsigned long event, void *ptr)
  94. {
  95. struct netdev_notifier_info *ni = ptr;
  96. struct net_device *dev = ni->dev;
  97. if (dev->netdev_ops->ndo_open != ipoib_open)
  98. return NOTIFY_DONE;
  99. switch (event) {
  100. case NETDEV_REGISTER:
  101. ipoib_create_debug_files(dev);
  102. break;
  103. case NETDEV_CHANGENAME:
  104. ipoib_delete_debug_files(dev);
  105. ipoib_create_debug_files(dev);
  106. break;
  107. case NETDEV_UNREGISTER:
  108. ipoib_delete_debug_files(dev);
  109. break;
  110. }
  111. return NOTIFY_DONE;
  112. }
  113. #endif
  114. struct ipoib_ifupdown_work {
  115. struct work_struct work;
  116. struct net_device *dev;
  117. netdevice_tracker dev_tracker;
  118. bool up;
  119. };
  120. static void ipoib_ifupdown_task(struct work_struct *work)
  121. {
  122. struct ipoib_ifupdown_work *pwork =
  123. container_of(work, struct ipoib_ifupdown_work, work);
  124. struct net_device *dev = pwork->dev;
  125. unsigned int flags;
  126. rtnl_lock();
  127. flags = dev->flags;
  128. if (pwork->up)
  129. flags |= IFF_UP;
  130. else
  131. flags &= ~IFF_UP;
  132. if (dev->flags != flags)
  133. dev_change_flags(dev, flags, NULL);
  134. rtnl_unlock();
  135. netdev_put(dev, &pwork->dev_tracker);
  136. kfree(pwork);
  137. }
  138. static void ipoib_schedule_ifupdown_task(struct net_device *dev, bool up)
  139. {
  140. struct ipoib_ifupdown_work *work;
  141. if ((up && (dev->flags & IFF_UP)) ||
  142. (!up && !(dev->flags & IFF_UP)))
  143. return;
  144. work = kmalloc_obj(*work);
  145. if (!work)
  146. return;
  147. work->dev = dev;
  148. netdev_hold(dev, &work->dev_tracker, GFP_KERNEL);
  149. work->up = up;
  150. INIT_WORK(&work->work, ipoib_ifupdown_task);
  151. queue_work(ipoib_workqueue, &work->work);
  152. }
  153. int ipoib_open(struct net_device *dev)
  154. {
  155. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  156. ipoib_dbg(priv, "bringing up interface\n");
  157. netif_carrier_off(dev);
  158. set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
  159. if (ipoib_ib_dev_open(dev)) {
  160. if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
  161. return 0;
  162. goto err_disable;
  163. }
  164. ipoib_ib_dev_up(dev);
  165. if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
  166. struct ipoib_dev_priv *cpriv;
  167. /* Bring up any child interfaces too */
  168. netdev_lock_ops_to_full(dev);
  169. list_for_each_entry(cpriv, &priv->child_intfs, list)
  170. ipoib_schedule_ifupdown_task(cpriv->dev, true);
  171. netdev_unlock_full_to_ops(dev);
  172. } else if (priv->parent) {
  173. struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
  174. if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags))
  175. ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n",
  176. ppriv->dev->name);
  177. }
  178. netif_start_queue(dev);
  179. return 0;
  180. err_disable:
  181. clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
  182. return -EINVAL;
  183. }
  184. static int ipoib_stop(struct net_device *dev)
  185. {
  186. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  187. ipoib_dbg(priv, "stopping interface\n");
  188. clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
  189. netif_stop_queue(dev);
  190. ipoib_ib_dev_down(dev);
  191. ipoib_ib_dev_stop(dev);
  192. if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
  193. struct ipoib_dev_priv *cpriv;
  194. /* Bring down any child interfaces too */
  195. netdev_lock_ops_to_full(dev);
  196. list_for_each_entry(cpriv, &priv->child_intfs, list)
  197. ipoib_schedule_ifupdown_task(cpriv->dev, false);
  198. netdev_unlock_full_to_ops(dev);
  199. }
  200. return 0;
  201. }
  202. static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
  203. {
  204. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  205. if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
  206. features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
  207. return features;
  208. }
  209. static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
  210. {
  211. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  212. int ret = 0;
  213. /* dev->mtu > 2K ==> connected mode */
  214. if (ipoib_cm_admin_enabled(dev)) {
  215. if (new_mtu > ipoib_cm_max_mtu(dev))
  216. return -EINVAL;
  217. if (new_mtu > priv->mcast_mtu)
  218. ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
  219. priv->mcast_mtu);
  220. WRITE_ONCE(dev->mtu, new_mtu);
  221. return 0;
  222. }
  223. if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) ||
  224. new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
  225. return -EINVAL;
  226. priv->admin_mtu = new_mtu;
  227. if (priv->mcast_mtu < priv->admin_mtu)
  228. ipoib_dbg(priv, "MTU must be smaller than the underlying "
  229. "link layer MTU - 4 (%u)\n", priv->mcast_mtu);
  230. new_mtu = min(priv->mcast_mtu, priv->admin_mtu);
  231. if (priv->rn_ops->ndo_change_mtu) {
  232. bool carrier_status = netif_carrier_ok(dev);
  233. netif_carrier_off(dev);
  234. /* notify lower level on the real mtu */
  235. ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu);
  236. if (carrier_status)
  237. netif_carrier_on(dev);
  238. } else {
  239. WRITE_ONCE(dev->mtu, new_mtu);
  240. }
  241. return ret;
  242. }
  243. static void ipoib_get_stats(struct net_device *dev,
  244. struct rtnl_link_stats64 *stats)
  245. {
  246. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  247. if (priv->rn_ops->ndo_get_stats64)
  248. priv->rn_ops->ndo_get_stats64(dev, stats);
  249. else
  250. netdev_stats_to_stats64(stats, &dev->stats);
  251. }
  252. /* Called with an RCU read lock taken */
  253. static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
  254. struct net_device *dev)
  255. {
  256. struct net *net = dev_net(dev);
  257. struct in_device *in_dev;
  258. struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
  259. struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
  260. __be32 ret_addr;
  261. switch (addr->sa_family) {
  262. case AF_INET:
  263. in_dev = in_dev_get(dev);
  264. if (!in_dev)
  265. return false;
  266. ret_addr = inet_confirm_addr(net, in_dev, 0,
  267. addr_in->sin_addr.s_addr,
  268. RT_SCOPE_HOST);
  269. in_dev_put(in_dev);
  270. if (ret_addr)
  271. return true;
  272. break;
  273. case AF_INET6:
  274. if (IS_ENABLED(CONFIG_IPV6) &&
  275. ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
  276. return true;
  277. break;
  278. }
  279. return false;
  280. }
  281. /*
  282. * Find the L2 master net_device on top of the given net_device.
  283. * @dev: base IPoIB net_device
  284. *
  285. * Returns the L2 master net_device with reference held if the L2 master
  286. * exists (such as bond netdevice), or returns same netdev with reference
  287. * held when master does not exist or when L3 master (such as VRF netdev).
  288. */
  289. static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
  290. {
  291. struct net_device *master;
  292. rcu_read_lock();
  293. master = netdev_master_upper_dev_get_rcu(dev);
  294. if (!master || netif_is_l3_master(master))
  295. master = dev;
  296. dev_hold(master);
  297. rcu_read_unlock();
  298. return master;
  299. }
  300. struct ipoib_walk_data {
  301. const struct sockaddr *addr;
  302. struct net_device *result;
  303. };
  304. static int ipoib_upper_walk(struct net_device *upper,
  305. struct netdev_nested_priv *priv)
  306. {
  307. struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data;
  308. int ret = 0;
  309. if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) {
  310. dev_hold(upper);
  311. data->result = upper;
  312. ret = 1;
  313. }
  314. return ret;
  315. }
  316. /**
  317. * ipoib_get_net_dev_match_addr - Find a net_device matching
  318. * the given address, which is an upper device of the given net_device.
  319. *
  320. * @addr: IP address to look for.
  321. * @dev: base IPoIB net_device
  322. *
  323. * If found, returns the net_device with a reference held. Otherwise return
  324. * NULL.
  325. */
  326. static struct net_device *ipoib_get_net_dev_match_addr(
  327. const struct sockaddr *addr, struct net_device *dev)
  328. {
  329. struct netdev_nested_priv priv;
  330. struct ipoib_walk_data data = {
  331. .addr = addr,
  332. };
  333. priv.data = (void *)&data;
  334. rcu_read_lock();
  335. if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
  336. dev_hold(dev);
  337. data.result = dev;
  338. goto out;
  339. }
  340. netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv);
  341. out:
  342. rcu_read_unlock();
  343. return data.result;
  344. }
  345. /* returns the number of IPoIB netdevs on top a given ipoib device matching a
  346. * pkey_index and address, if one exists.
  347. *
  348. * @found_net_dev: contains a matching net_device if the return value >= 1,
  349. * with a reference held. */
  350. static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
  351. const union ib_gid *gid,
  352. u16 pkey_index,
  353. const struct sockaddr *addr,
  354. int nesting,
  355. struct net_device **found_net_dev)
  356. {
  357. struct ipoib_dev_priv *child_priv;
  358. struct net_device *net_dev = NULL;
  359. int matches = 0;
  360. if (priv->pkey_index == pkey_index &&
  361. (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
  362. if (!addr) {
  363. net_dev = ipoib_get_master_net_dev(priv->dev);
  364. } else {
  365. /* Verify the net_device matches the IP address, as
  366. * IPoIB child devices currently share a GID. */
  367. net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
  368. }
  369. if (net_dev) {
  370. if (!*found_net_dev)
  371. *found_net_dev = net_dev;
  372. else
  373. dev_put(net_dev);
  374. ++matches;
  375. }
  376. }
  377. if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
  378. return matches;
  379. /* Check child interfaces */
  380. netdev_lock(priv->dev);
  381. list_for_each_entry(child_priv, &priv->child_intfs, list) {
  382. matches += ipoib_match_gid_pkey_addr(child_priv, gid,
  383. pkey_index, addr,
  384. nesting + 1,
  385. found_net_dev);
  386. if (matches > 1)
  387. break;
  388. }
  389. netdev_unlock(priv->dev);
  390. return matches;
  391. }
  392. /* Returns the number of matching net_devs found (between 0 and 2). Also
  393. * return the matching net_device in the @net_dev parameter, holding a
  394. * reference to the net_device, if the number of matches >= 1 */
  395. static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port,
  396. u16 pkey_index,
  397. const union ib_gid *gid,
  398. const struct sockaddr *addr,
  399. struct net_device **net_dev)
  400. {
  401. struct ipoib_dev_priv *priv;
  402. int matches = 0;
  403. *net_dev = NULL;
  404. list_for_each_entry(priv, dev_list, list) {
  405. if (priv->port != port)
  406. continue;
  407. matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
  408. addr, 0, net_dev);
  409. if (matches > 1)
  410. break;
  411. }
  412. return matches;
  413. }
  414. static struct net_device *ipoib_get_net_dev_by_params(
  415. struct ib_device *dev, u32 port, u16 pkey,
  416. const union ib_gid *gid, const struct sockaddr *addr,
  417. void *client_data)
  418. {
  419. struct net_device *net_dev;
  420. struct list_head *dev_list = client_data;
  421. u16 pkey_index;
  422. int matches;
  423. int ret;
  424. if (!rdma_protocol_ib(dev, port))
  425. return NULL;
  426. ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
  427. if (ret)
  428. return NULL;
  429. /* See if we can find a unique device matching the pkey and GID */
  430. matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
  431. gid, NULL, &net_dev);
  432. switch (matches) {
  433. case 0:
  434. return NULL;
  435. case 1:
  436. return net_dev;
  437. }
  438. dev_put(net_dev);
  439. /* Couldn't find a unique device with pkey and GID only. Use L3
  440. * address to uniquely match the net device */
  441. matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
  442. gid, addr, &net_dev);
  443. switch (matches) {
  444. case 0:
  445. return NULL;
  446. default:
  447. dev_warn_ratelimited(&dev->dev,
  448. "duplicate IP address detected\n");
  449. fallthrough;
  450. case 1:
  451. return net_dev;
  452. }
  453. }
  454. int ipoib_set_mode(struct net_device *dev, const char *buf)
  455. {
  456. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  457. if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
  458. !strcmp(buf, "connected\n")) ||
  459. (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
  460. !strcmp(buf, "datagram\n"))) {
  461. return 0;
  462. }
  463. /* flush paths if we switch modes so that connections are restarted */
  464. if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
  465. set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
  466. ipoib_warn(priv, "enabling connected mode "
  467. "will cause multicast packet drops\n");
  468. netdev_lock_ops(dev);
  469. netdev_update_features(dev);
  470. netif_set_mtu(dev, ipoib_cm_max_mtu(dev));
  471. netif_set_real_num_tx_queues(dev, 1);
  472. netdev_unlock_ops(dev);
  473. rtnl_unlock();
  474. priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
  475. ipoib_flush_paths(dev);
  476. return (!rtnl_trylock()) ? -EBUSY : 0;
  477. }
  478. if (!strcmp(buf, "datagram\n")) {
  479. clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
  480. netdev_lock_ops(dev);
  481. netdev_update_features(dev);
  482. netif_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
  483. netif_set_real_num_tx_queues(dev, dev->num_tx_queues);
  484. netdev_unlock_ops(dev);
  485. rtnl_unlock();
  486. ipoib_flush_paths(dev);
  487. return (!rtnl_trylock()) ? -EBUSY : 0;
  488. }
  489. return -EINVAL;
  490. }
  491. struct ipoib_path *__path_find(struct net_device *dev, void *gid)
  492. {
  493. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  494. struct rb_node *n = priv->path_tree.rb_node;
  495. struct ipoib_path *path;
  496. int ret;
  497. while (n) {
  498. path = rb_entry(n, struct ipoib_path, rb_node);
  499. ret = memcmp(gid, path->pathrec.dgid.raw,
  500. sizeof (union ib_gid));
  501. if (ret < 0)
  502. n = n->rb_left;
  503. else if (ret > 0)
  504. n = n->rb_right;
  505. else
  506. return path;
  507. }
  508. return NULL;
  509. }
  510. static int __path_add(struct net_device *dev, struct ipoib_path *path)
  511. {
  512. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  513. struct rb_node **n = &priv->path_tree.rb_node;
  514. struct rb_node *pn = NULL;
  515. struct ipoib_path *tpath;
  516. int ret;
  517. while (*n) {
  518. pn = *n;
  519. tpath = rb_entry(pn, struct ipoib_path, rb_node);
  520. ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
  521. sizeof (union ib_gid));
  522. if (ret < 0)
  523. n = &pn->rb_left;
  524. else if (ret > 0)
  525. n = &pn->rb_right;
  526. else
  527. return -EEXIST;
  528. }
  529. rb_link_node(&path->rb_node, pn, n);
  530. rb_insert_color(&path->rb_node, &priv->path_tree);
  531. list_add_tail(&path->list, &priv->path_list);
  532. return 0;
  533. }
  534. static void path_free(struct net_device *dev, struct ipoib_path *path)
  535. {
  536. struct sk_buff *skb;
  537. while ((skb = __skb_dequeue(&path->queue)))
  538. dev_kfree_skb_irq(skb);
  539. ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);
  540. /* remove all neigh connected to this path */
  541. ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
  542. if (path->ah)
  543. ipoib_put_ah(path->ah);
  544. kfree(path);
  545. }
  546. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  547. struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
  548. {
  549. struct ipoib_path_iter *iter;
  550. iter = kmalloc_obj(*iter);
  551. if (!iter)
  552. return NULL;
  553. iter->dev = dev;
  554. memset(iter->path.pathrec.dgid.raw, 0, 16);
  555. if (ipoib_path_iter_next(iter)) {
  556. kfree(iter);
  557. return NULL;
  558. }
  559. return iter;
  560. }
  561. int ipoib_path_iter_next(struct ipoib_path_iter *iter)
  562. {
  563. struct ipoib_dev_priv *priv = ipoib_priv(iter->dev);
  564. struct rb_node *n;
  565. struct ipoib_path *path;
  566. int ret = 1;
  567. spin_lock_irq(&priv->lock);
  568. n = rb_first(&priv->path_tree);
  569. while (n) {
  570. path = rb_entry(n, struct ipoib_path, rb_node);
  571. if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
  572. sizeof (union ib_gid)) < 0) {
  573. iter->path = *path;
  574. ret = 0;
  575. break;
  576. }
  577. n = rb_next(n);
  578. }
  579. spin_unlock_irq(&priv->lock);
  580. return ret;
  581. }
  582. void ipoib_path_iter_read(struct ipoib_path_iter *iter,
  583. struct ipoib_path *path)
  584. {
  585. *path = iter->path;
  586. }
  587. #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
  588. void ipoib_mark_paths_invalid(struct net_device *dev)
  589. {
  590. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  591. struct ipoib_path *path, *tp;
  592. spin_lock_irq(&priv->lock);
  593. list_for_each_entry_safe(path, tp, &priv->path_list, list) {
  594. ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n",
  595. be32_to_cpu(sa_path_get_dlid(&path->pathrec)),
  596. path->pathrec.dgid.raw);
  597. if (path->ah)
  598. path->ah->valid = 0;
  599. }
  600. spin_unlock_irq(&priv->lock);
  601. }
  602. static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
  603. {
  604. struct ipoib_pseudo_header *phdr;
  605. phdr = skb_push(skb, sizeof(*phdr));
  606. memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
  607. }
  608. void ipoib_flush_paths(struct net_device *dev)
  609. {
  610. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  611. struct ipoib_path *path, *tp;
  612. LIST_HEAD(remove_list);
  613. unsigned long flags;
  614. netif_tx_lock_bh(dev);
  615. spin_lock_irqsave(&priv->lock, flags);
  616. list_splice_init(&priv->path_list, &remove_list);
  617. list_for_each_entry(path, &remove_list, list)
  618. rb_erase(&path->rb_node, &priv->path_tree);
  619. list_for_each_entry_safe(path, tp, &remove_list, list) {
  620. if (path->query)
  621. ib_sa_cancel_query(path->query_id, path->query);
  622. spin_unlock_irqrestore(&priv->lock, flags);
  623. netif_tx_unlock_bh(dev);
  624. wait_for_completion(&path->done);
  625. path_free(dev, path);
  626. netif_tx_lock_bh(dev);
  627. spin_lock_irqsave(&priv->lock, flags);
  628. }
  629. spin_unlock_irqrestore(&priv->lock, flags);
  630. netif_tx_unlock_bh(dev);
  631. }
  632. static void path_rec_completion(int status,
  633. struct sa_path_rec *pathrec,
  634. unsigned int num_prs, void *path_ptr)
  635. {
  636. struct ipoib_path *path = path_ptr;
  637. struct net_device *dev = path->dev;
  638. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  639. struct ipoib_ah *ah = NULL;
  640. struct ipoib_ah *old_ah = NULL;
  641. struct ipoib_neigh *neigh, *tn;
  642. struct sk_buff_head skqueue;
  643. struct sk_buff *skb;
  644. unsigned long flags;
  645. if (!status)
  646. ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
  647. be32_to_cpu(sa_path_get_dlid(pathrec)),
  648. pathrec->dgid.raw);
  649. else
  650. ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
  651. status, path->pathrec.dgid.raw);
  652. skb_queue_head_init(&skqueue);
  653. if (!status) {
  654. struct rdma_ah_attr av;
  655. if (!ib_init_ah_attr_from_path(priv->ca, priv->port,
  656. pathrec, &av, NULL)) {
  657. ah = ipoib_create_ah(dev, priv->pd, &av);
  658. rdma_destroy_ah_attr(&av);
  659. }
  660. }
  661. spin_lock_irqsave(&priv->lock, flags);
  662. if (!IS_ERR_OR_NULL(ah)) {
  663. /*
  664. * pathrec.dgid is used as the database key from the LLADDR,
  665. * it must remain unchanged even if the SA returns a different
  666. * GID to use in the AH.
  667. */
  668. if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw,
  669. sizeof(union ib_gid))) {
  670. ipoib_dbg(
  671. priv,
  672. "%s got PathRec for gid %pI6 while asked for %pI6\n",
  673. dev->name, pathrec->dgid.raw,
  674. path->pathrec.dgid.raw);
  675. memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw,
  676. sizeof(union ib_gid));
  677. }
  678. path->pathrec = *pathrec;
  679. old_ah = path->ah;
  680. path->ah = ah;
  681. ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
  682. ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
  683. pathrec->sl);
  684. while ((skb = __skb_dequeue(&path->queue)))
  685. __skb_queue_tail(&skqueue, skb);
  686. list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
  687. if (neigh->ah) {
  688. WARN_ON(neigh->ah != old_ah);
  689. /*
  690. * Dropping the ah reference inside
  691. * priv->lock is safe here, because we
  692. * will hold one more reference from
  693. * the original value of path->ah (ie
  694. * old_ah).
  695. */
  696. ipoib_put_ah(neigh->ah);
  697. }
  698. kref_get(&path->ah->ref);
  699. neigh->ah = path->ah;
  700. if (ipoib_cm_enabled(dev, neigh->daddr)) {
  701. if (!ipoib_cm_get(neigh))
  702. ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
  703. path,
  704. neigh));
  705. if (!ipoib_cm_get(neigh)) {
  706. ipoib_neigh_free(neigh);
  707. continue;
  708. }
  709. }
  710. while ((skb = __skb_dequeue(&neigh->queue)))
  711. __skb_queue_tail(&skqueue, skb);
  712. }
  713. path->ah->valid = 1;
  714. }
  715. path->query = NULL;
  716. complete(&path->done);
  717. spin_unlock_irqrestore(&priv->lock, flags);
  718. if (IS_ERR_OR_NULL(ah))
  719. ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
  720. if (old_ah)
  721. ipoib_put_ah(old_ah);
  722. while ((skb = __skb_dequeue(&skqueue))) {
  723. int ret;
  724. skb->dev = dev;
  725. ret = dev_queue_xmit(skb);
  726. if (ret)
  727. ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
  728. __func__, ret);
  729. }
  730. }
  731. static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path,
  732. void *gid)
  733. {
  734. path->dev = priv->dev;
  735. if (rdma_cap_opa_ah(priv->ca, priv->port))
  736. path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA;
  737. else
  738. path->pathrec.rec_type = SA_PATH_REC_TYPE_IB;
  739. memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid));
  740. path->pathrec.sgid = priv->local_gid;
  741. path->pathrec.pkey = cpu_to_be16(priv->pkey);
  742. path->pathrec.numb_path = 1;
  743. path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
  744. }
  745. static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
  746. {
  747. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  748. struct ipoib_path *path;
  749. if (!priv->broadcast)
  750. return NULL;
  751. path = kzalloc_obj(*path, GFP_ATOMIC);
  752. if (!path)
  753. return NULL;
  754. skb_queue_head_init(&path->queue);
  755. INIT_LIST_HEAD(&path->neigh_list);
  756. init_path_rec(priv, path, gid);
  757. return path;
  758. }
  759. static int path_rec_start(struct net_device *dev,
  760. struct ipoib_path *path)
  761. {
  762. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  763. ipoib_dbg(priv, "Start path record lookup for %pI6\n",
  764. path->pathrec.dgid.raw);
  765. init_completion(&path->done);
  766. path->query_id =
  767. ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
  768. &path->pathrec,
  769. IB_SA_PATH_REC_DGID |
  770. IB_SA_PATH_REC_SGID |
  771. IB_SA_PATH_REC_NUMB_PATH |
  772. IB_SA_PATH_REC_TRAFFIC_CLASS |
  773. IB_SA_PATH_REC_PKEY,
  774. 1000, GFP_ATOMIC,
  775. path_rec_completion,
  776. path, &path->query);
  777. if (path->query_id < 0) {
  778. ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
  779. path->query = NULL;
  780. complete(&path->done);
  781. return path->query_id;
  782. }
  783. return 0;
  784. }
  785. static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr,
  786. struct net_device *dev)
  787. {
  788. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  789. struct ipoib_path *path;
  790. unsigned long flags;
  791. spin_lock_irqsave(&priv->lock, flags);
  792. path = __path_find(dev, daddr + 4);
  793. if (!path)
  794. goto out;
  795. if (!path->query)
  796. path_rec_start(dev, path);
  797. out:
  798. spin_unlock_irqrestore(&priv->lock, flags);
  799. }
  800. static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr,
  801. struct net_device *dev)
  802. {
  803. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  804. struct rdma_netdev *rn = netdev_priv(dev);
  805. struct ipoib_path *path;
  806. struct ipoib_neigh *neigh;
  807. unsigned long flags;
  808. spin_lock_irqsave(&priv->lock, flags);
  809. neigh = ipoib_neigh_alloc(daddr, dev);
  810. if (!neigh) {
  811. spin_unlock_irqrestore(&priv->lock, flags);
  812. ++dev->stats.tx_dropped;
  813. dev_kfree_skb_any(skb);
  814. return NULL;
  815. }
  816. /* To avoid race condition, make sure that the
  817. * neigh will be added only once.
  818. */
  819. if (unlikely(!list_empty(&neigh->list))) {
  820. spin_unlock_irqrestore(&priv->lock, flags);
  821. return neigh;
  822. }
  823. path = __path_find(dev, daddr + 4);
  824. if (!path) {
  825. path = path_rec_create(dev, daddr + 4);
  826. if (!path)
  827. goto err_path;
  828. __path_add(dev, path);
  829. }
  830. list_add_tail(&neigh->list, &path->neigh_list);
  831. if (path->ah && path->ah->valid) {
  832. kref_get(&path->ah->ref);
  833. neigh->ah = path->ah;
  834. if (ipoib_cm_enabled(dev, neigh->daddr)) {
  835. if (!ipoib_cm_get(neigh))
  836. ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
  837. if (!ipoib_cm_get(neigh)) {
  838. ipoib_neigh_free(neigh);
  839. goto err_drop;
  840. }
  841. if (skb_queue_len(&neigh->queue) <
  842. IPOIB_MAX_PATH_REC_QUEUE) {
  843. push_pseudo_header(skb, neigh->daddr);
  844. __skb_queue_tail(&neigh->queue, skb);
  845. } else {
  846. ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
  847. skb_queue_len(&neigh->queue));
  848. goto err_drop;
  849. }
  850. } else {
  851. spin_unlock_irqrestore(&priv->lock, flags);
  852. path->ah->last_send = rn->send(dev, skb, path->ah->ah,
  853. IPOIB_QPN(daddr));
  854. ipoib_neigh_put(neigh);
  855. return NULL;
  856. }
  857. } else {
  858. neigh->ah = NULL;
  859. if (!path->query && path_rec_start(dev, path))
  860. goto err_path;
  861. if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
  862. push_pseudo_header(skb, neigh->daddr);
  863. __skb_queue_tail(&neigh->queue, skb);
  864. } else {
  865. goto err_drop;
  866. }
  867. }
  868. spin_unlock_irqrestore(&priv->lock, flags);
  869. ipoib_neigh_put(neigh);
  870. return NULL;
  871. err_path:
  872. ipoib_neigh_free(neigh);
  873. err_drop:
  874. ++dev->stats.tx_dropped;
  875. dev_kfree_skb_any(skb);
  876. spin_unlock_irqrestore(&priv->lock, flags);
  877. ipoib_neigh_put(neigh);
  878. return NULL;
  879. }
  880. static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
  881. struct ipoib_pseudo_header *phdr)
  882. {
  883. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  884. struct rdma_netdev *rn = netdev_priv(dev);
  885. struct ipoib_path *path;
  886. unsigned long flags;
  887. spin_lock_irqsave(&priv->lock, flags);
  888. /* no broadcast means that all paths are (going to be) not valid */
  889. if (!priv->broadcast)
  890. goto drop_and_unlock;
  891. path = __path_find(dev, phdr->hwaddr + 4);
  892. if (!path || !path->ah || !path->ah->valid) {
  893. if (!path) {
  894. path = path_rec_create(dev, phdr->hwaddr + 4);
  895. if (!path)
  896. goto drop_and_unlock;
  897. __path_add(dev, path);
  898. } else {
  899. /*
  900. * make sure there are no changes in the existing
  901. * path record
  902. */
  903. init_path_rec(priv, path, phdr->hwaddr + 4);
  904. }
  905. if (!path->query && path_rec_start(dev, path)) {
  906. goto drop_and_unlock;
  907. }
  908. if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
  909. push_pseudo_header(skb, phdr->hwaddr);
  910. __skb_queue_tail(&path->queue, skb);
  911. goto unlock;
  912. } else {
  913. goto drop_and_unlock;
  914. }
  915. }
  916. spin_unlock_irqrestore(&priv->lock, flags);
  917. ipoib_dbg(priv, "Send unicast ARP to %08x\n",
  918. be32_to_cpu(sa_path_get_dlid(&path->pathrec)));
  919. path->ah->last_send = rn->send(dev, skb, path->ah->ah,
  920. IPOIB_QPN(phdr->hwaddr));
  921. return;
  922. drop_and_unlock:
  923. ++dev->stats.tx_dropped;
  924. dev_kfree_skb_any(skb);
  925. unlock:
  926. spin_unlock_irqrestore(&priv->lock, flags);
  927. }
  928. static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
  929. {
  930. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  931. struct rdma_netdev *rn = netdev_priv(dev);
  932. struct ipoib_neigh *neigh;
  933. struct ipoib_pseudo_header *phdr;
  934. struct ipoib_header *header;
  935. unsigned long flags;
  936. phdr = (struct ipoib_pseudo_header *) skb->data;
  937. skb_pull(skb, sizeof(*phdr));
  938. header = (struct ipoib_header *) skb->data;
  939. if (unlikely(phdr->hwaddr[4] == 0xff)) {
  940. /* multicast, arrange "if" according to probability */
  941. if ((header->proto != htons(ETH_P_IP)) &&
  942. (header->proto != htons(ETH_P_IPV6)) &&
  943. (header->proto != htons(ETH_P_ARP)) &&
  944. (header->proto != htons(ETH_P_RARP)) &&
  945. (header->proto != htons(ETH_P_TIPC))) {
  946. /* ethertype not supported by IPoIB */
  947. ++dev->stats.tx_dropped;
  948. dev_kfree_skb_any(skb);
  949. return NETDEV_TX_OK;
  950. }
  951. /* Add in the P_Key for multicast*/
  952. phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
  953. phdr->hwaddr[9] = priv->pkey & 0xff;
  954. neigh = ipoib_neigh_get(dev, phdr->hwaddr);
  955. if (likely(neigh))
  956. goto send_using_neigh;
  957. ipoib_mcast_send(dev, phdr->hwaddr, skb);
  958. return NETDEV_TX_OK;
  959. }
  960. /* unicast, arrange "switch" according to probability */
  961. switch (header->proto) {
  962. case htons(ETH_P_IP):
  963. case htons(ETH_P_IPV6):
  964. case htons(ETH_P_TIPC):
  965. neigh = ipoib_neigh_get(dev, phdr->hwaddr);
  966. if (unlikely(!neigh)) {
  967. neigh = neigh_add_path(skb, phdr->hwaddr, dev);
  968. if (likely(!neigh))
  969. return NETDEV_TX_OK;
  970. }
  971. break;
  972. case htons(ETH_P_ARP):
  973. case htons(ETH_P_RARP):
  974. /* for unicast ARP and RARP should always perform path find */
  975. unicast_arp_send(skb, dev, phdr);
  976. return NETDEV_TX_OK;
  977. default:
  978. /* ethertype not supported by IPoIB */
  979. ++dev->stats.tx_dropped;
  980. dev_kfree_skb_any(skb);
  981. return NETDEV_TX_OK;
  982. }
  983. send_using_neigh:
  984. /* note we now hold a ref to neigh */
  985. if (ipoib_cm_get(neigh)) {
  986. if (ipoib_cm_up(neigh)) {
  987. ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
  988. goto unref;
  989. }
  990. } else if (neigh->ah && neigh->ah->valid) {
  991. neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
  992. IPOIB_QPN(phdr->hwaddr));
  993. goto unref;
  994. } else if (neigh->ah) {
  995. neigh_refresh_path(neigh, phdr->hwaddr, dev);
  996. }
  997. if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
  998. push_pseudo_header(skb, phdr->hwaddr);
  999. spin_lock_irqsave(&priv->lock, flags);
  1000. __skb_queue_tail(&neigh->queue, skb);
  1001. spin_unlock_irqrestore(&priv->lock, flags);
  1002. } else {
  1003. ++dev->stats.tx_dropped;
  1004. dev_kfree_skb_any(skb);
  1005. }
  1006. unref:
  1007. ipoib_neigh_put(neigh);
  1008. return NETDEV_TX_OK;
  1009. }
  1010. static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
  1011. {
  1012. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1013. struct rdma_netdev *rn = netdev_priv(dev);
  1014. if (rn->tx_timeout) {
  1015. rn->tx_timeout(dev, txqueue);
  1016. return;
  1017. }
  1018. ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
  1019. jiffies_to_msecs(jiffies - dev_trans_start(dev)));
  1020. ipoib_warn(priv,
  1021. "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n",
  1022. netif_queue_stopped(dev), priv->tx_head, priv->tx_tail,
  1023. priv->global_tx_head, priv->global_tx_tail);
  1024. schedule_work(&priv->tx_timeout_work);
  1025. }
  1026. void ipoib_ib_tx_timeout_work(struct work_struct *work)
  1027. {
  1028. struct ipoib_dev_priv *priv = container_of(work,
  1029. struct ipoib_dev_priv,
  1030. tx_timeout_work);
  1031. int err;
  1032. rtnl_lock();
  1033. netdev_lock_ops(priv->dev);
  1034. if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
  1035. goto unlock;
  1036. ipoib_stop(priv->dev);
  1037. err = ipoib_open(priv->dev);
  1038. if (err) {
  1039. ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n",
  1040. err);
  1041. goto unlock;
  1042. }
  1043. netif_tx_wake_all_queues(priv->dev);
  1044. unlock:
  1045. netdev_unlock_ops(priv->dev);
  1046. rtnl_unlock();
  1047. }
  1048. static int ipoib_hard_header(struct sk_buff *skb,
  1049. struct net_device *dev,
  1050. unsigned short type,
  1051. const void *daddr,
  1052. const void *saddr,
  1053. unsigned int len)
  1054. {
  1055. struct ipoib_header *header;
  1056. header = skb_push(skb, sizeof(*header));
  1057. header->proto = htons(type);
  1058. header->reserved = 0;
  1059. /*
  1060. * we don't rely on dst_entry structure, always stuff the
  1061. * destination address into skb hard header so we can figure out where
  1062. * to send the packet later.
  1063. */
  1064. push_pseudo_header(skb, daddr);
  1065. return IPOIB_HARD_LEN;
  1066. }
  1067. static void ipoib_set_mcast_list(struct net_device *dev)
  1068. {
  1069. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1070. if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
  1071. ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
  1072. return;
  1073. }
  1074. queue_work(priv->wq, &priv->restart_task);
  1075. }
  1076. static int ipoib_get_iflink(const struct net_device *dev)
  1077. {
  1078. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1079. /* parent interface */
  1080. if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
  1081. return READ_ONCE(dev->ifindex);
  1082. /* child/vlan interface */
  1083. return READ_ONCE(priv->parent->ifindex);
  1084. }
  1085. static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
  1086. {
  1087. /*
  1088. * Use only the address parts that contributes to spreading
  1089. * The subnet prefix is not used as one can not connect to
  1090. * same remote port (GUID) using the same remote QPN via two
  1091. * different subnets.
  1092. */
  1093. /* qpn octets[1:4) & port GUID octets[12:20) */
  1094. u32 *d32 = (u32 *) daddr;
  1095. u32 hv;
  1096. hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
  1097. return hv & htbl->mask;
  1098. }
  1099. struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
  1100. {
  1101. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1102. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1103. struct ipoib_neigh_hash *htbl;
  1104. struct ipoib_neigh *neigh = NULL;
  1105. u32 hash_val;
  1106. rcu_read_lock_bh();
  1107. htbl = rcu_dereference_bh(ntbl->htbl);
  1108. if (!htbl)
  1109. goto out_unlock;
  1110. hash_val = ipoib_addr_hash(htbl, daddr);
  1111. for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
  1112. neigh != NULL;
  1113. neigh = rcu_dereference_bh(neigh->hnext)) {
  1114. if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
  1115. /* found, take one ref on behalf of the caller */
  1116. if (!refcount_inc_not_zero(&neigh->refcnt)) {
  1117. /* deleted */
  1118. neigh = NULL;
  1119. goto out_unlock;
  1120. }
  1121. if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
  1122. neigh->alive = jiffies;
  1123. goto out_unlock;
  1124. }
  1125. }
  1126. out_unlock:
  1127. rcu_read_unlock_bh();
  1128. return neigh;
  1129. }
  1130. static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
  1131. {
  1132. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1133. struct ipoib_neigh_hash *htbl;
  1134. unsigned long neigh_obsolete;
  1135. unsigned long dt;
  1136. unsigned long flags;
  1137. int i;
  1138. LIST_HEAD(remove_list);
  1139. spin_lock_irqsave(&priv->lock, flags);
  1140. htbl = rcu_dereference_protected(ntbl->htbl,
  1141. lockdep_is_held(&priv->lock));
  1142. if (!htbl)
  1143. goto out_unlock;
  1144. /* neigh is obsolete if it was idle for two GC periods */
  1145. dt = 2 * arp_tbl.gc_interval;
  1146. neigh_obsolete = jiffies - dt;
  1147. for (i = 0; i < htbl->size; i++) {
  1148. struct ipoib_neigh *neigh;
  1149. struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  1150. while ((neigh = rcu_dereference_protected(*np,
  1151. lockdep_is_held(&priv->lock))) != NULL) {
  1152. /* was the neigh idle for two GC periods */
  1153. if (time_after(neigh_obsolete, neigh->alive)) {
  1154. ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
  1155. rcu_assign_pointer(*np,
  1156. rcu_dereference_protected(neigh->hnext,
  1157. lockdep_is_held(&priv->lock)));
  1158. /* remove from path/mc list */
  1159. list_del_init(&neigh->list);
  1160. call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1161. } else {
  1162. np = &neigh->hnext;
  1163. }
  1164. }
  1165. }
  1166. out_unlock:
  1167. spin_unlock_irqrestore(&priv->lock, flags);
  1168. ipoib_mcast_remove_list(&remove_list);
  1169. }
  1170. static void ipoib_reap_neigh(struct work_struct *work)
  1171. {
  1172. struct ipoib_dev_priv *priv =
  1173. container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
  1174. __ipoib_reap_neigh(priv);
  1175. queue_delayed_work(priv->wq, &priv->neigh_reap_task,
  1176. arp_tbl.gc_interval);
  1177. }
  1178. static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
  1179. struct net_device *dev)
  1180. {
  1181. struct ipoib_neigh *neigh;
  1182. neigh = kzalloc_obj(*neigh, GFP_ATOMIC);
  1183. if (!neigh)
  1184. return NULL;
  1185. neigh->dev = dev;
  1186. memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
  1187. skb_queue_head_init(&neigh->queue);
  1188. INIT_LIST_HEAD(&neigh->list);
  1189. ipoib_cm_set(neigh, NULL);
  1190. /* one ref on behalf of the caller */
  1191. refcount_set(&neigh->refcnt, 1);
  1192. return neigh;
  1193. }
  1194. struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
  1195. struct net_device *dev)
  1196. {
  1197. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1198. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1199. struct ipoib_neigh_hash *htbl;
  1200. struct ipoib_neigh *neigh;
  1201. u32 hash_val;
  1202. htbl = rcu_dereference_protected(ntbl->htbl,
  1203. lockdep_is_held(&priv->lock));
  1204. if (!htbl) {
  1205. neigh = NULL;
  1206. goto out_unlock;
  1207. }
  1208. /* need to add a new neigh, but maybe some other thread succeeded?
  1209. * recalc hash, maybe hash resize took place so we do a search
  1210. */
  1211. hash_val = ipoib_addr_hash(htbl, daddr);
  1212. for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
  1213. lockdep_is_held(&priv->lock));
  1214. neigh != NULL;
  1215. neigh = rcu_dereference_protected(neigh->hnext,
  1216. lockdep_is_held(&priv->lock))) {
  1217. if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
  1218. /* found, take one ref on behalf of the caller */
  1219. if (!refcount_inc_not_zero(&neigh->refcnt)) {
  1220. /* deleted */
  1221. neigh = NULL;
  1222. break;
  1223. }
  1224. neigh->alive = jiffies;
  1225. goto out_unlock;
  1226. }
  1227. }
  1228. neigh = ipoib_neigh_ctor(daddr, dev);
  1229. if (!neigh)
  1230. goto out_unlock;
  1231. /* one ref on behalf of the hash table */
  1232. refcount_inc(&neigh->refcnt);
  1233. neigh->alive = jiffies;
  1234. /* put in hash */
  1235. rcu_assign_pointer(neigh->hnext,
  1236. rcu_dereference_protected(htbl->buckets[hash_val],
  1237. lockdep_is_held(&priv->lock)));
  1238. rcu_assign_pointer(htbl->buckets[hash_val], neigh);
  1239. atomic_inc(&ntbl->entries);
  1240. out_unlock:
  1241. return neigh;
  1242. }
  1243. void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
  1244. {
  1245. /* neigh reference count was dropprd to zero */
  1246. struct net_device *dev = neigh->dev;
  1247. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1248. struct sk_buff *skb;
  1249. if (neigh->ah)
  1250. ipoib_put_ah(neigh->ah);
  1251. while ((skb = __skb_dequeue(&neigh->queue))) {
  1252. ++dev->stats.tx_dropped;
  1253. dev_kfree_skb_any(skb);
  1254. }
  1255. if (ipoib_cm_get(neigh))
  1256. ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
  1257. ipoib_dbg(ipoib_priv(dev),
  1258. "neigh free for %06x %pI6\n",
  1259. IPOIB_QPN(neigh->daddr),
  1260. neigh->daddr + 4);
  1261. kfree(neigh);
  1262. if (atomic_dec_and_test(&priv->ntbl.entries)) {
  1263. if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
  1264. complete(&priv->ntbl.flushed);
  1265. }
  1266. }
  1267. static void ipoib_neigh_reclaim(struct rcu_head *rp)
  1268. {
  1269. /* Called as a result of removal from hash table */
  1270. struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
  1271. /* note TX context may hold another ref */
  1272. ipoib_neigh_put(neigh);
  1273. }
  1274. void ipoib_neigh_free(struct ipoib_neigh *neigh)
  1275. {
  1276. struct net_device *dev = neigh->dev;
  1277. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1278. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1279. struct ipoib_neigh_hash *htbl;
  1280. struct ipoib_neigh __rcu **np;
  1281. struct ipoib_neigh *n;
  1282. u32 hash_val;
  1283. htbl = rcu_dereference_protected(ntbl->htbl,
  1284. lockdep_is_held(&priv->lock));
  1285. if (!htbl)
  1286. return;
  1287. hash_val = ipoib_addr_hash(htbl, neigh->daddr);
  1288. np = &htbl->buckets[hash_val];
  1289. for (n = rcu_dereference_protected(*np,
  1290. lockdep_is_held(&priv->lock));
  1291. n != NULL;
  1292. n = rcu_dereference_protected(*np,
  1293. lockdep_is_held(&priv->lock))) {
  1294. if (n == neigh) {
  1295. /* found */
  1296. rcu_assign_pointer(*np,
  1297. rcu_dereference_protected(neigh->hnext,
  1298. lockdep_is_held(&priv->lock)));
  1299. /* remove from parent list */
  1300. list_del_init(&neigh->list);
  1301. call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1302. return;
  1303. } else {
  1304. np = &n->hnext;
  1305. }
  1306. }
  1307. }
  1308. static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
  1309. {
  1310. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1311. struct ipoib_neigh_hash *htbl;
  1312. struct ipoib_neigh __rcu **buckets;
  1313. u32 size;
  1314. clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
  1315. ntbl->htbl = NULL;
  1316. htbl = kzalloc_obj(*htbl);
  1317. if (!htbl)
  1318. return -ENOMEM;
  1319. size = roundup_pow_of_two(arp_tbl.gc_thresh3);
  1320. buckets = kvzalloc_objs(*buckets, size);
  1321. if (!buckets) {
  1322. kfree(htbl);
  1323. return -ENOMEM;
  1324. }
  1325. htbl->size = size;
  1326. htbl->mask = (size - 1);
  1327. htbl->buckets = buckets;
  1328. RCU_INIT_POINTER(ntbl->htbl, htbl);
  1329. htbl->ntbl = ntbl;
  1330. atomic_set(&ntbl->entries, 0);
  1331. /* start garbage collection */
  1332. queue_delayed_work(priv->wq, &priv->neigh_reap_task,
  1333. arp_tbl.gc_interval);
  1334. return 0;
  1335. }
  1336. static void neigh_hash_free_rcu(struct rcu_head *head)
  1337. {
  1338. struct ipoib_neigh_hash *htbl = container_of(head,
  1339. struct ipoib_neigh_hash,
  1340. rcu);
  1341. struct ipoib_neigh __rcu **buckets = htbl->buckets;
  1342. struct ipoib_neigh_table *ntbl = htbl->ntbl;
  1343. kvfree(buckets);
  1344. kfree(htbl);
  1345. complete(&ntbl->deleted);
  1346. }
  1347. void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
  1348. {
  1349. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1350. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1351. struct ipoib_neigh_hash *htbl;
  1352. unsigned long flags;
  1353. int i;
  1354. /* remove all neigh connected to a given path or mcast */
  1355. spin_lock_irqsave(&priv->lock, flags);
  1356. htbl = rcu_dereference_protected(ntbl->htbl,
  1357. lockdep_is_held(&priv->lock));
  1358. if (!htbl)
  1359. goto out_unlock;
  1360. for (i = 0; i < htbl->size; i++) {
  1361. struct ipoib_neigh *neigh;
  1362. struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  1363. while ((neigh = rcu_dereference_protected(*np,
  1364. lockdep_is_held(&priv->lock))) != NULL) {
  1365. /* delete neighs belong to this parent */
  1366. if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
  1367. rcu_assign_pointer(*np,
  1368. rcu_dereference_protected(neigh->hnext,
  1369. lockdep_is_held(&priv->lock)));
  1370. /* remove from parent list */
  1371. list_del_init(&neigh->list);
  1372. call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1373. } else {
  1374. np = &neigh->hnext;
  1375. }
  1376. }
  1377. }
  1378. out_unlock:
  1379. spin_unlock_irqrestore(&priv->lock, flags);
  1380. }
  1381. static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
  1382. {
  1383. struct ipoib_neigh_table *ntbl = &priv->ntbl;
  1384. struct ipoib_neigh_hash *htbl;
  1385. unsigned long flags;
  1386. int i, wait_flushed = 0;
  1387. init_completion(&priv->ntbl.flushed);
  1388. set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
  1389. spin_lock_irqsave(&priv->lock, flags);
  1390. htbl = rcu_dereference_protected(ntbl->htbl,
  1391. lockdep_is_held(&priv->lock));
  1392. if (!htbl)
  1393. goto out_unlock;
  1394. wait_flushed = atomic_read(&priv->ntbl.entries);
  1395. if (!wait_flushed)
  1396. goto free_htbl;
  1397. for (i = 0; i < htbl->size; i++) {
  1398. struct ipoib_neigh *neigh;
  1399. struct ipoib_neigh __rcu **np = &htbl->buckets[i];
  1400. while ((neigh = rcu_dereference_protected(*np,
  1401. lockdep_is_held(&priv->lock))) != NULL) {
  1402. rcu_assign_pointer(*np,
  1403. rcu_dereference_protected(neigh->hnext,
  1404. lockdep_is_held(&priv->lock)));
  1405. /* remove from path/mc list */
  1406. list_del_init(&neigh->list);
  1407. call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
  1408. }
  1409. }
  1410. free_htbl:
  1411. rcu_assign_pointer(ntbl->htbl, NULL);
  1412. call_rcu(&htbl->rcu, neigh_hash_free_rcu);
  1413. out_unlock:
  1414. spin_unlock_irqrestore(&priv->lock, flags);
  1415. if (wait_flushed)
  1416. wait_for_completion(&priv->ntbl.flushed);
  1417. }
  1418. static void ipoib_neigh_hash_uninit(struct net_device *dev)
  1419. {
  1420. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1421. ipoib_dbg(priv, "%s\n", __func__);
  1422. init_completion(&priv->ntbl.deleted);
  1423. cancel_delayed_work_sync(&priv->neigh_reap_task);
  1424. ipoib_flush_neighs(priv);
  1425. wait_for_completion(&priv->ntbl.deleted);
  1426. }
  1427. static void ipoib_napi_add(struct net_device *dev)
  1428. {
  1429. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1430. netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll,
  1431. IPOIB_NUM_WC);
  1432. netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll,
  1433. MAX_SEND_CQE);
  1434. }
  1435. static void ipoib_napi_del(struct net_device *dev)
  1436. {
  1437. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1438. netif_napi_del(&priv->recv_napi);
  1439. netif_napi_del(&priv->send_napi);
  1440. }
  1441. static void ipoib_dev_uninit_default(struct net_device *dev)
  1442. {
  1443. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1444. ipoib_transport_dev_cleanup(dev);
  1445. ipoib_napi_del(dev);
  1446. ipoib_cm_dev_cleanup(dev);
  1447. kfree(priv->rx_ring);
  1448. vfree(priv->tx_ring);
  1449. priv->rx_ring = NULL;
  1450. priv->tx_ring = NULL;
  1451. }
  1452. static int ipoib_dev_init_default(struct net_device *dev)
  1453. {
  1454. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1455. u8 addr_mod[3];
  1456. ipoib_napi_add(dev);
  1457. /* Allocate RX/TX "rings" to hold queued skbs */
  1458. priv->rx_ring = kzalloc_objs(*priv->rx_ring, ipoib_recvq_size);
  1459. if (!priv->rx_ring)
  1460. goto out;
  1461. priv->tx_ring = vzalloc(array_size(ipoib_sendq_size,
  1462. sizeof(*priv->tx_ring)));
  1463. if (!priv->tx_ring) {
  1464. pr_warn("%s: failed to allocate TX ring (%d entries)\n",
  1465. priv->ca->name, ipoib_sendq_size);
  1466. goto out_rx_ring_cleanup;
  1467. }
  1468. /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */
  1469. if (ipoib_transport_dev_init(dev, priv->ca)) {
  1470. pr_warn("%s: ipoib_transport_dev_init failed\n",
  1471. priv->ca->name);
  1472. goto out_tx_ring_cleanup;
  1473. }
  1474. /* after qp created set dev address */
  1475. addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff;
  1476. addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff;
  1477. addr_mod[2] = (priv->qp->qp_num) & 0xff;
  1478. dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod));
  1479. return 0;
  1480. out_tx_ring_cleanup:
  1481. vfree(priv->tx_ring);
  1482. out_rx_ring_cleanup:
  1483. kfree(priv->rx_ring);
  1484. out:
  1485. ipoib_napi_del(dev);
  1486. return -ENOMEM;
  1487. }
  1488. static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
  1489. int cmd)
  1490. {
  1491. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1492. if (!priv->rn_ops->ndo_eth_ioctl)
  1493. return -EOPNOTSUPP;
  1494. return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd);
  1495. }
  1496. static int ipoib_hwtstamp_get(struct net_device *dev,
  1497. struct kernel_hwtstamp_config *config)
  1498. {
  1499. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1500. if (!priv->rn_ops->ndo_hwtstamp_get)
  1501. return -EOPNOTSUPP;
  1502. return priv->rn_ops->ndo_hwtstamp_get(dev, config);
  1503. }
  1504. static int ipoib_hwtstamp_set(struct net_device *dev,
  1505. struct kernel_hwtstamp_config *config,
  1506. struct netlink_ext_ack *extack)
  1507. {
  1508. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1509. if (!priv->rn_ops->ndo_hwtstamp_set)
  1510. return -EOPNOTSUPP;
  1511. return priv->rn_ops->ndo_hwtstamp_set(dev, config, extack);
  1512. }
  1513. static int ipoib_dev_init(struct net_device *dev)
  1514. {
  1515. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1516. int ret = -ENOMEM;
  1517. priv->qp = NULL;
  1518. /*
  1519. * the various IPoIB tasks assume they will never race against
  1520. * themselves, so always use a single thread workqueue
  1521. */
  1522. priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
  1523. if (!priv->wq) {
  1524. pr_warn("%s: failed to allocate device WQ\n", dev->name);
  1525. goto out;
  1526. }
  1527. /* create pd, which used both for control and datapath*/
  1528. priv->pd = ib_alloc_pd(priv->ca, 0);
  1529. if (IS_ERR(priv->pd)) {
  1530. pr_warn("%s: failed to allocate PD\n", priv->ca->name);
  1531. goto clean_wq;
  1532. }
  1533. ret = priv->rn_ops->ndo_init(dev);
  1534. if (ret) {
  1535. pr_warn("%s failed to init HW resource\n", dev->name);
  1536. goto out_free_pd;
  1537. }
  1538. ret = ipoib_neigh_hash_init(priv);
  1539. if (ret) {
  1540. pr_warn("%s failed to init neigh hash\n", dev->name);
  1541. goto out_dev_uninit;
  1542. }
  1543. if (dev->flags & IFF_UP) {
  1544. if (ipoib_ib_dev_open(dev)) {
  1545. pr_warn("%s failed to open device\n", dev->name);
  1546. ret = -ENODEV;
  1547. goto out_hash_uninit;
  1548. }
  1549. }
  1550. return 0;
  1551. out_hash_uninit:
  1552. ipoib_neigh_hash_uninit(dev);
  1553. out_dev_uninit:
  1554. ipoib_ib_dev_cleanup(dev);
  1555. out_free_pd:
  1556. if (priv->pd) {
  1557. ib_dealloc_pd(priv->pd);
  1558. priv->pd = NULL;
  1559. }
  1560. clean_wq:
  1561. if (priv->wq) {
  1562. destroy_workqueue(priv->wq);
  1563. priv->wq = NULL;
  1564. }
  1565. out:
  1566. return ret;
  1567. }
  1568. /*
  1569. * This must be called before doing an unregister_netdev on a parent device to
  1570. * shutdown the IB event handler.
  1571. */
  1572. static void ipoib_parent_unregister_pre(struct net_device *ndev)
  1573. {
  1574. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1575. /*
  1576. * ipoib_set_mac checks netif_running before pushing work, clearing
  1577. * running ensures the it will not add more work.
  1578. */
  1579. rtnl_lock();
  1580. dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
  1581. rtnl_unlock();
  1582. /* ipoib_event() cannot be running once this returns */
  1583. ib_unregister_event_handler(&priv->event_handler);
  1584. /*
  1585. * Work on the queue grabs the rtnl lock, so this cannot be done while
  1586. * also holding it.
  1587. */
  1588. flush_workqueue(ipoib_workqueue);
  1589. }
  1590. static void ipoib_set_dev_features(struct ipoib_dev_priv *priv)
  1591. {
  1592. priv->hca_caps = priv->ca->attrs.device_cap_flags;
  1593. priv->kernel_caps = priv->ca->attrs.kernel_cap_flags;
  1594. if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
  1595. priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
  1596. if (priv->kernel_caps & IBK_UD_TSO)
  1597. priv->dev->hw_features |= NETIF_F_TSO;
  1598. priv->dev->features |= priv->dev->hw_features;
  1599. }
  1600. }
  1601. static int ipoib_parent_init(struct net_device *ndev)
  1602. {
  1603. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1604. struct ib_port_attr attr;
  1605. int result;
  1606. result = ib_query_port(priv->ca, priv->port, &attr);
  1607. if (result) {
  1608. pr_warn("%s: ib_query_port %d failed\n", priv->ca->name,
  1609. priv->port);
  1610. return result;
  1611. }
  1612. priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);
  1613. result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
  1614. if (result) {
  1615. pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
  1616. priv->ca->name, priv->port, result);
  1617. return result;
  1618. }
  1619. result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid);
  1620. if (result) {
  1621. pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n",
  1622. priv->ca->name, priv->port, result);
  1623. return result;
  1624. }
  1625. dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid));
  1626. SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
  1627. priv->dev->dev_port = priv->port - 1;
  1628. /* Let's set this one too for backwards compatibility. */
  1629. priv->dev->dev_id = priv->port - 1;
  1630. return 0;
  1631. }
  1632. static void ipoib_child_init(struct net_device *ndev)
  1633. {
  1634. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1635. struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
  1636. priv->max_ib_mtu = ppriv->max_ib_mtu;
  1637. set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
  1638. if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN))
  1639. memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
  1640. sizeof(priv->local_gid));
  1641. else {
  1642. __dev_addr_set(priv->dev, ppriv->dev->dev_addr,
  1643. INFINIBAND_ALEN);
  1644. memcpy(&priv->local_gid, &ppriv->local_gid,
  1645. sizeof(priv->local_gid));
  1646. }
  1647. }
  1648. static int ipoib_ndo_init(struct net_device *ndev)
  1649. {
  1650. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1651. int rc;
  1652. struct rdma_netdev *rn = netdev_priv(ndev);
  1653. if (priv->parent) {
  1654. ipoib_child_init(ndev);
  1655. } else {
  1656. rc = ipoib_parent_init(ndev);
  1657. if (rc)
  1658. return rc;
  1659. }
  1660. /* MTU will be reset when mcast join happens */
  1661. ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
  1662. priv->mcast_mtu = priv->admin_mtu = ndev->mtu;
  1663. rn->mtu = priv->mcast_mtu;
  1664. ndev->max_mtu = IPOIB_CM_MTU;
  1665. ndev->neigh_priv_len = sizeof(struct ipoib_neigh);
  1666. /*
  1667. * Set the full membership bit, so that we join the right
  1668. * broadcast group, etc.
  1669. */
  1670. priv->pkey |= 0x8000;
  1671. ndev->broadcast[8] = priv->pkey >> 8;
  1672. ndev->broadcast[9] = priv->pkey & 0xff;
  1673. set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
  1674. ipoib_set_dev_features(priv);
  1675. rc = ipoib_dev_init(ndev);
  1676. if (rc) {
  1677. pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
  1678. priv->ca->name, priv->dev->name, priv->port, rc);
  1679. return rc;
  1680. }
  1681. if (priv->parent) {
  1682. struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
  1683. dev_hold(priv->parent);
  1684. netdev_lock(priv->parent);
  1685. list_add_tail(&priv->list, &ppriv->child_intfs);
  1686. netdev_unlock(priv->parent);
  1687. }
  1688. return 0;
  1689. }
  1690. static void ipoib_ndo_uninit(struct net_device *dev)
  1691. {
  1692. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1693. /*
  1694. * ipoib_remove_one guarantees the children are removed before the
  1695. * parent, and that is the only place where a parent can be removed.
  1696. */
  1697. WARN_ON(!list_empty(&priv->child_intfs));
  1698. if (priv->parent) {
  1699. struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);
  1700. netdev_lock(ppriv->dev);
  1701. list_del(&priv->list);
  1702. netdev_unlock(ppriv->dev);
  1703. }
  1704. ipoib_neigh_hash_uninit(dev);
  1705. ipoib_ib_dev_cleanup(dev);
  1706. /* no more works over the priv->wq */
  1707. if (priv->wq) {
  1708. /* See ipoib_mcast_carrier_on_task() */
  1709. WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
  1710. destroy_workqueue(priv->wq);
  1711. priv->wq = NULL;
  1712. }
  1713. dev_put(priv->parent);
  1714. }
  1715. static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
  1716. {
  1717. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1718. return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
  1719. }
  1720. static int ipoib_get_vf_config(struct net_device *dev, int vf,
  1721. struct ifla_vf_info *ivf)
  1722. {
  1723. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1724. int err;
  1725. err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
  1726. if (err)
  1727. return err;
  1728. ivf->vf = vf;
  1729. memcpy(ivf->mac, dev->dev_addr, dev->addr_len);
  1730. return 0;
  1731. }
  1732. static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
  1733. {
  1734. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1735. if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
  1736. return -EINVAL;
  1737. return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
  1738. }
  1739. static int ipoib_get_vf_guid(struct net_device *dev, int vf,
  1740. struct ifla_vf_guid *node_guid,
  1741. struct ifla_vf_guid *port_guid)
  1742. {
  1743. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1744. return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid);
  1745. }
  1746. static int ipoib_get_vf_stats(struct net_device *dev, int vf,
  1747. struct ifla_vf_stats *vf_stats)
  1748. {
  1749. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1750. return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
  1751. }
  1752. static const struct header_ops ipoib_header_ops = {
  1753. .create = ipoib_hard_header,
  1754. };
  1755. static const struct net_device_ops ipoib_netdev_ops_pf = {
  1756. .ndo_init = ipoib_ndo_init,
  1757. .ndo_uninit = ipoib_ndo_uninit,
  1758. .ndo_open = ipoib_open,
  1759. .ndo_stop = ipoib_stop,
  1760. .ndo_change_mtu = ipoib_change_mtu,
  1761. .ndo_fix_features = ipoib_fix_features,
  1762. .ndo_start_xmit = ipoib_start_xmit,
  1763. .ndo_tx_timeout = ipoib_timeout,
  1764. .ndo_set_rx_mode = ipoib_set_mcast_list,
  1765. .ndo_get_iflink = ipoib_get_iflink,
  1766. .ndo_set_vf_link_state = ipoib_set_vf_link_state,
  1767. .ndo_get_vf_config = ipoib_get_vf_config,
  1768. .ndo_get_vf_stats = ipoib_get_vf_stats,
  1769. .ndo_get_vf_guid = ipoib_get_vf_guid,
  1770. .ndo_set_vf_guid = ipoib_set_vf_guid,
  1771. .ndo_set_mac_address = ipoib_set_mac,
  1772. .ndo_get_stats64 = ipoib_get_stats,
  1773. .ndo_eth_ioctl = ipoib_ioctl,
  1774. .ndo_hwtstamp_get = ipoib_hwtstamp_get,
  1775. .ndo_hwtstamp_set = ipoib_hwtstamp_set,
  1776. };
  1777. static const struct net_device_ops ipoib_netdev_ops_vf = {
  1778. .ndo_init = ipoib_ndo_init,
  1779. .ndo_uninit = ipoib_ndo_uninit,
  1780. .ndo_open = ipoib_open,
  1781. .ndo_stop = ipoib_stop,
  1782. .ndo_change_mtu = ipoib_change_mtu,
  1783. .ndo_fix_features = ipoib_fix_features,
  1784. .ndo_start_xmit = ipoib_start_xmit,
  1785. .ndo_tx_timeout = ipoib_timeout,
  1786. .ndo_set_rx_mode = ipoib_set_mcast_list,
  1787. .ndo_get_iflink = ipoib_get_iflink,
  1788. .ndo_get_stats64 = ipoib_get_stats,
  1789. .ndo_eth_ioctl = ipoib_ioctl,
  1790. .ndo_hwtstamp_get = ipoib_hwtstamp_get,
  1791. .ndo_hwtstamp_set = ipoib_hwtstamp_set,
  1792. };
  1793. static const struct net_device_ops ipoib_netdev_default_pf = {
  1794. .ndo_init = ipoib_dev_init_default,
  1795. .ndo_uninit = ipoib_dev_uninit_default,
  1796. .ndo_open = ipoib_ib_dev_open_default,
  1797. .ndo_stop = ipoib_ib_dev_stop_default,
  1798. };
  1799. void ipoib_setup_common(struct net_device *dev)
  1800. {
  1801. dev->header_ops = &ipoib_header_ops;
  1802. dev->netdev_ops = &ipoib_netdev_default_pf;
  1803. ipoib_set_ethtool_ops(dev);
  1804. dev->watchdog_timeo = 10 * HZ;
  1805. dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
  1806. dev->hard_header_len = IPOIB_HARD_LEN;
  1807. dev->addr_len = INFINIBAND_ALEN;
  1808. dev->type = ARPHRD_INFINIBAND;
  1809. dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
  1810. dev->features = (NETIF_F_VLAN_CHALLENGED |
  1811. NETIF_F_HIGHDMA);
  1812. netif_keep_dst(dev);
  1813. memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
  1814. /*
  1815. * unregister_netdev always frees the netdev, we use this mode
  1816. * consistently to unify all the various unregister paths, including
  1817. * those connected to rtnl_link_ops which require it.
  1818. */
  1819. dev->needs_free_netdev = true;
  1820. }
  1821. static void ipoib_build_priv(struct net_device *dev)
  1822. {
  1823. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1824. priv->dev = dev;
  1825. spin_lock_init(&priv->lock);
  1826. mutex_init(&priv->mcast_mutex);
  1827. INIT_LIST_HEAD(&priv->path_list);
  1828. INIT_LIST_HEAD(&priv->child_intfs);
  1829. INIT_LIST_HEAD(&priv->dead_ahs);
  1830. INIT_LIST_HEAD(&priv->multicast_list);
  1831. INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
  1832. INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
  1833. INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work);
  1834. INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
  1835. INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
  1836. INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
  1837. INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
  1838. INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work);
  1839. INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
  1840. INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
  1841. }
  1842. static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port,
  1843. const char *name)
  1844. {
  1845. struct net_device *dev;
  1846. dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
  1847. NET_NAME_UNKNOWN, ipoib_setup_common);
  1848. if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP)
  1849. return dev;
  1850. dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN,
  1851. ipoib_setup_common);
  1852. if (!dev)
  1853. return ERR_PTR(-ENOMEM);
  1854. return dev;
  1855. }
  1856. int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name,
  1857. struct net_device *dev)
  1858. {
  1859. struct rdma_netdev *rn = netdev_priv(dev);
  1860. struct ipoib_dev_priv *priv;
  1861. int rc;
  1862. priv = kzalloc_obj(*priv);
  1863. if (!priv)
  1864. return -ENOMEM;
  1865. priv->ca = hca;
  1866. priv->port = port;
  1867. rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
  1868. NET_NAME_UNKNOWN, ipoib_setup_common, dev);
  1869. if (rc) {
  1870. if (rc != -EOPNOTSUPP)
  1871. goto out;
  1872. rn->send = ipoib_send;
  1873. rn->attach_mcast = ipoib_mcast_attach;
  1874. rn->detach_mcast = ipoib_mcast_detach;
  1875. rn->hca = hca;
  1876. rc = netif_set_real_num_tx_queues(dev, 1);
  1877. if (rc)
  1878. goto out;
  1879. rc = netif_set_real_num_rx_queues(dev, 1);
  1880. if (rc)
  1881. goto out;
  1882. }
  1883. priv->rn_ops = dev->netdev_ops;
  1884. if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION)
  1885. dev->netdev_ops = &ipoib_netdev_ops_vf;
  1886. else
  1887. dev->netdev_ops = &ipoib_netdev_ops_pf;
  1888. rn->clnt_priv = priv;
  1889. /*
  1890. * Only the child register_netdev flows can handle priv_destructor
  1891. * being set, so we force it to NULL here and handle manually until it
  1892. * is safe to turn on.
  1893. */
  1894. priv->next_priv_destructor = dev->priv_destructor;
  1895. dev->priv_destructor = NULL;
  1896. ipoib_build_priv(dev);
  1897. return 0;
  1898. out:
  1899. kfree(priv);
  1900. return rc;
  1901. }
  1902. struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
  1903. const char *name)
  1904. {
  1905. struct net_device *dev;
  1906. int rc;
  1907. dev = ipoib_alloc_netdev(hca, port, name);
  1908. if (IS_ERR(dev))
  1909. return dev;
  1910. rc = ipoib_intf_init(hca, port, name, dev);
  1911. if (rc) {
  1912. free_netdev(dev);
  1913. return ERR_PTR(rc);
  1914. }
  1915. /*
  1916. * Upon success the caller must ensure ipoib_intf_free is called or
  1917. * register_netdevice succeed'd and priv_destructor is set to
  1918. * ipoib_intf_free.
  1919. */
  1920. return dev;
  1921. }
  1922. void ipoib_intf_free(struct net_device *dev)
  1923. {
  1924. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  1925. struct rdma_netdev *rn = netdev_priv(dev);
  1926. dev->priv_destructor = priv->next_priv_destructor;
  1927. if (dev->priv_destructor)
  1928. dev->priv_destructor(dev);
  1929. /*
  1930. * There are some error flows around register_netdev failing that may
  1931. * attempt to call priv_destructor twice, prevent that from happening.
  1932. */
  1933. dev->priv_destructor = NULL;
  1934. /* unregister/destroy is very complicated. Make bugs more obvious. */
  1935. rn->clnt_priv = NULL;
  1936. kfree(priv);
  1937. }
  1938. static ssize_t pkey_show(struct device *dev, struct device_attribute *attr,
  1939. char *buf)
  1940. {
  1941. struct net_device *ndev = to_net_dev(dev);
  1942. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1943. return sysfs_emit(buf, "0x%04x\n", priv->pkey);
  1944. }
  1945. static DEVICE_ATTR_RO(pkey);
  1946. static ssize_t umcast_show(struct device *dev, struct device_attribute *attr,
  1947. char *buf)
  1948. {
  1949. struct net_device *ndev = to_net_dev(dev);
  1950. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1951. return sysfs_emit(buf, "%d\n",
  1952. test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
  1953. }
  1954. void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
  1955. {
  1956. struct ipoib_dev_priv *priv = ipoib_priv(ndev);
  1957. if (umcast_val > 0) {
  1958. set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
  1959. ipoib_warn(priv, "ignoring multicast groups joined directly "
  1960. "by userspace\n");
  1961. } else
  1962. clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
  1963. }
  1964. static ssize_t umcast_store(struct device *dev, struct device_attribute *attr,
  1965. const char *buf, size_t count)
  1966. {
  1967. unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
  1968. ipoib_set_umcast(to_net_dev(dev), umcast_val);
  1969. return count;
  1970. }
  1971. static DEVICE_ATTR_RW(umcast);
  1972. int ipoib_add_umcast_attr(struct net_device *dev)
  1973. {
  1974. return device_create_file(&dev->dev, &dev_attr_umcast);
  1975. }
  1976. static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
  1977. {
  1978. struct ipoib_dev_priv *child_priv;
  1979. struct net_device *netdev = priv->dev;
  1980. netif_addr_lock_bh(netdev);
  1981. memcpy(&priv->local_gid.global.interface_id,
  1982. &gid->global.interface_id,
  1983. sizeof(gid->global.interface_id));
  1984. dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid));
  1985. clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
  1986. netif_addr_unlock_bh(netdev);
  1987. if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
  1988. netdev_lock_ops_to_full(priv->dev);
  1989. list_for_each_entry(child_priv, &priv->child_intfs, list)
  1990. set_base_guid(child_priv, gid);
  1991. netdev_unlock_full_to_ops(priv->dev);
  1992. }
  1993. }
  1994. static int ipoib_check_lladdr(struct net_device *dev,
  1995. struct sockaddr_storage *ss)
  1996. {
  1997. union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
  1998. int ret = 0;
  1999. netif_addr_lock_bh(dev);
  2000. /* Make sure the QPN, reserved and subnet prefix match the current
  2001. * lladdr, it also makes sure the lladdr is unicast.
  2002. */
  2003. if (memcmp(dev->dev_addr, ss->__data,
  2004. 4 + sizeof(gid->global.subnet_prefix)) ||
  2005. gid->global.interface_id == 0)
  2006. ret = -EINVAL;
  2007. netif_addr_unlock_bh(dev);
  2008. return ret;
  2009. }
  2010. static int ipoib_set_mac(struct net_device *dev, void *addr)
  2011. {
  2012. struct ipoib_dev_priv *priv = ipoib_priv(dev);
  2013. struct sockaddr_storage *ss = addr;
  2014. int ret;
  2015. if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
  2016. return -EBUSY;
  2017. ret = ipoib_check_lladdr(dev, ss);
  2018. if (ret)
  2019. return ret;
  2020. set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
  2021. if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
  2022. struct ipoib_dev_priv *cpriv;
  2023. netdev_lock_ops_to_full(dev);
  2024. list_for_each_entry(cpriv, &priv->child_intfs, list)
  2025. queue_work(ipoib_workqueue, &cpriv->flush_light);
  2026. netdev_unlock_full_to_ops(dev);
  2027. }
  2028. queue_work(ipoib_workqueue, &priv->flush_light);
  2029. return 0;
  2030. }
  2031. static ssize_t create_child_store(struct device *dev,
  2032. struct device_attribute *attr,
  2033. const char *buf, size_t count)
  2034. {
  2035. int pkey;
  2036. int ret;
  2037. if (sscanf(buf, "%i", &pkey) != 1)
  2038. return -EINVAL;
  2039. if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
  2040. return -EINVAL;
  2041. ret = ipoib_vlan_add(to_net_dev(dev), pkey);
  2042. return ret ? ret : count;
  2043. }
  2044. static DEVICE_ATTR_WO(create_child);
  2045. static ssize_t delete_child_store(struct device *dev,
  2046. struct device_attribute *attr,
  2047. const char *buf, size_t count)
  2048. {
  2049. int pkey;
  2050. int ret;
  2051. if (sscanf(buf, "%i", &pkey) != 1)
  2052. return -EINVAL;
  2053. if (pkey < 0 || pkey > 0xffff)
  2054. return -EINVAL;
  2055. ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
  2056. return ret ? ret : count;
  2057. }
  2058. static DEVICE_ATTR_WO(delete_child);
  2059. int ipoib_add_pkey_attr(struct net_device *dev)
  2060. {
  2061. return device_create_file(&dev->dev, &dev_attr_pkey);
  2062. }
  2063. /*
  2064. * We erroneously exposed the iface's port number in the dev_id
  2065. * sysfs field long after dev_port was introduced for that purpose[1],
  2066. * and we need to stop everyone from relying on that.
  2067. * Let's overload the shower routine for the dev_id file here
  2068. * to gently bring the issue up.
  2069. *
  2070. * [1] https://www.spinics.net/lists/netdev/msg272123.html
  2071. */
  2072. static ssize_t dev_id_show(struct device *dev,
  2073. struct device_attribute *attr, char *buf)
  2074. {
  2075. struct net_device *ndev = to_net_dev(dev);
  2076. /*
  2077. * ndev->dev_port will be equal to 0 in old kernel prior to commit
  2078. * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
  2079. * port numbers") Zero was chosen as special case for user space
  2080. * applications to fallback and query dev_id to check if it has
  2081. * different value or not.
  2082. *
  2083. * Don't print warning in such scenario.
  2084. *
  2085. * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
  2086. */
  2087. if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
  2088. netdev_info_once(ndev,
  2089. "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
  2090. current->comm);
  2091. return sysfs_emit(buf, "%#x\n", ndev->dev_id);
  2092. }
  2093. static DEVICE_ATTR_RO(dev_id);
  2094. static int ipoib_intercept_dev_id_attr(struct net_device *dev)
  2095. {
  2096. device_remove_file(&dev->dev, &dev_attr_dev_id);
  2097. return device_create_file(&dev->dev, &dev_attr_dev_id);
  2098. }
  2099. static struct net_device *ipoib_add_port(const char *format,
  2100. struct ib_device *hca, u32 port)
  2101. {
  2102. struct rtnl_link_ops *ops = ipoib_get_link_ops();
  2103. struct rdma_netdev_alloc_params params;
  2104. struct ipoib_dev_priv *priv;
  2105. struct net_device *ndev;
  2106. int result;
  2107. ndev = ipoib_intf_alloc(hca, port, format);
  2108. if (IS_ERR(ndev)) {
  2109. pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
  2110. PTR_ERR(ndev));
  2111. return ndev;
  2112. }
  2113. priv = ipoib_priv(ndev);
  2114. INIT_IB_EVENT_HANDLER(&priv->event_handler,
  2115. priv->ca, ipoib_event);
  2116. ib_register_event_handler(&priv->event_handler);
  2117. /* call event handler to ensure pkey in sync */
  2118. ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY);
  2119. ndev->rtnl_link_ops = ipoib_get_link_ops();
  2120. dev_net_set(ndev, rdma_dev_net(hca));
  2121. result = register_netdev(ndev);
  2122. if (result) {
  2123. pr_warn("%s: couldn't register ipoib port %d; error %d\n",
  2124. hca->name, port, result);
  2125. ipoib_parent_unregister_pre(ndev);
  2126. ipoib_intf_free(ndev);
  2127. free_netdev(ndev);
  2128. return ERR_PTR(result);
  2129. }
  2130. if (hca->ops.rdma_netdev_get_params) {
  2131. int rc = hca->ops.rdma_netdev_get_params(hca, port,
  2132. RDMA_NETDEV_IPOIB,
  2133. &params);
  2134. if (!rc && ops->priv_size < params.sizeof_priv)
  2135. ops->priv_size = params.sizeof_priv;
  2136. }
  2137. /*
  2138. * We cannot set priv_destructor before register_netdev because we
  2139. * need priv to be always valid during the error flow to execute
  2140. * ipoib_parent_unregister_pre(). Instead handle it manually and only
  2141. * enter priv_destructor mode once we are completely registered.
  2142. */
  2143. ndev->priv_destructor = ipoib_intf_free;
  2144. if (ipoib_intercept_dev_id_attr(ndev))
  2145. goto sysfs_failed;
  2146. if (ipoib_cm_add_mode_attr(ndev))
  2147. goto sysfs_failed;
  2148. if (ipoib_add_pkey_attr(ndev))
  2149. goto sysfs_failed;
  2150. if (ipoib_add_umcast_attr(ndev))
  2151. goto sysfs_failed;
  2152. if (device_create_file(&ndev->dev, &dev_attr_create_child))
  2153. goto sysfs_failed;
  2154. if (device_create_file(&ndev->dev, &dev_attr_delete_child))
  2155. goto sysfs_failed;
  2156. return ndev;
  2157. sysfs_failed:
  2158. ipoib_parent_unregister_pre(ndev);
  2159. unregister_netdev(ndev);
  2160. return ERR_PTR(-ENOMEM);
  2161. }
  2162. static int ipoib_add_one(struct ib_device *device)
  2163. {
  2164. struct list_head *dev_list;
  2165. struct net_device *dev;
  2166. struct ipoib_dev_priv *priv;
  2167. unsigned int p;
  2168. int count = 0;
  2169. dev_list = kmalloc_obj(*dev_list);
  2170. if (!dev_list)
  2171. return -ENOMEM;
  2172. INIT_LIST_HEAD(dev_list);
  2173. rdma_for_each_port (device, p) {
  2174. if (!rdma_protocol_ib(device, p))
  2175. continue;
  2176. dev = ipoib_add_port("ib%d", device, p);
  2177. if (!IS_ERR(dev)) {
  2178. priv = ipoib_priv(dev);
  2179. list_add_tail(&priv->list, dev_list);
  2180. count++;
  2181. }
  2182. }
  2183. if (!count) {
  2184. kfree(dev_list);
  2185. return -EOPNOTSUPP;
  2186. }
  2187. ib_set_client_data(device, &ipoib_client, dev_list);
  2188. return 0;
  2189. }
  2190. static void ipoib_remove_one(struct ib_device *device, void *client_data)
  2191. {
  2192. struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
  2193. struct list_head *dev_list = client_data;
  2194. list_for_each_entry_safe(priv, tmp, dev_list, list) {
  2195. LIST_HEAD(head);
  2196. ipoib_parent_unregister_pre(priv->dev);
  2197. rtnl_lock();
  2198. netdev_lock(priv->dev);
  2199. list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs,
  2200. list)
  2201. unregister_netdevice_queue(cpriv->dev, &head);
  2202. netdev_unlock(priv->dev);
  2203. unregister_netdevice_queue(priv->dev, &head);
  2204. unregister_netdevice_many(&head);
  2205. rtnl_unlock();
  2206. }
  2207. kfree(dev_list);
  2208. }
  2209. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  2210. static struct notifier_block ipoib_netdev_notifier = {
  2211. .notifier_call = ipoib_netdev_event,
  2212. };
  2213. #endif
  2214. static int __init ipoib_init_module(void)
  2215. {
  2216. int ret;
  2217. ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
  2218. ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
  2219. ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
  2220. ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
  2221. ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
  2222. ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
  2223. #ifdef CONFIG_INFINIBAND_IPOIB_CM
  2224. ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
  2225. ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0);
  2226. #endif
  2227. /*
  2228. * When copying small received packets, we only copy from the
  2229. * linear data part of the SKB, so we rely on this condition.
  2230. */
  2231. BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
  2232. ipoib_register_debugfs();
  2233. /*
  2234. * We create a global workqueue here that is used for all flush
  2235. * operations. However, if you attempt to flush a workqueue
  2236. * from a task on that same workqueue, it deadlocks the system.
  2237. * We want to be able to flush the tasks associated with a
  2238. * specific net device, so we also create a workqueue for each
  2239. * netdevice. We queue up the tasks for that device only on
  2240. * its private workqueue, and we only queue up flush events
  2241. * on our global flush workqueue. This avoids the deadlocks.
  2242. */
  2243. ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0);
  2244. if (!ipoib_workqueue) {
  2245. ret = -ENOMEM;
  2246. goto err_fs;
  2247. }
  2248. ib_sa_register_client(&ipoib_sa_client);
  2249. ret = ib_register_client(&ipoib_client);
  2250. if (ret)
  2251. goto err_sa;
  2252. ret = ipoib_netlink_init();
  2253. if (ret)
  2254. goto err_client;
  2255. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  2256. register_netdevice_notifier(&ipoib_netdev_notifier);
  2257. #endif
  2258. return 0;
  2259. err_client:
  2260. ib_unregister_client(&ipoib_client);
  2261. err_sa:
  2262. ib_sa_unregister_client(&ipoib_sa_client);
  2263. destroy_workqueue(ipoib_workqueue);
  2264. err_fs:
  2265. ipoib_unregister_debugfs();
  2266. return ret;
  2267. }
  2268. static void __exit ipoib_cleanup_module(void)
  2269. {
  2270. #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  2271. unregister_netdevice_notifier(&ipoib_netdev_notifier);
  2272. #endif
  2273. ipoib_netlink_fini();
  2274. ib_unregister_client(&ipoib_client);
  2275. ib_sa_unregister_client(&ipoib_sa_client);
  2276. ipoib_unregister_debugfs();
  2277. destroy_workqueue(ipoib_workqueue);
  2278. }
  2279. module_init(ipoib_init_module);
  2280. module_exit(ipoib_cleanup_module);