genhd.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * gendisk handling
  4. *
  5. * Portions Copyright (C) 2020 Christoph Hellwig
  6. */
  7. #include <linux/module.h>
  8. #include <linux/ctype.h>
  9. #include <linux/fs.h>
  10. #include <linux/kdev_t.h>
  11. #include <linux/kernel.h>
  12. #include <linux/blkdev.h>
  13. #include <linux/backing-dev.h>
  14. #include <linux/init.h>
  15. #include <linux/spinlock.h>
  16. #include <linux/proc_fs.h>
  17. #include <linux/seq_file.h>
  18. #include <linux/slab.h>
  19. #include <linux/kmod.h>
  20. #include <linux/major.h>
  21. #include <linux/mutex.h>
  22. #include <linux/idr.h>
  23. #include <linux/log2.h>
  24. #include <linux/pm_runtime.h>
  25. #include <linux/badblocks.h>
  26. #include <linux/part_stat.h>
  27. #include <linux/blktrace_api.h>
  28. #include "blk-throttle.h"
  29. #include "blk.h"
  30. #include "blk-mq-sched.h"
  31. #include "blk-rq-qos.h"
  32. #include "blk-cgroup.h"
  33. static struct kobject *block_depr;
  34. /*
  35. * Unique, monotonically increasing sequential number associated with block
  36. * devices instances (i.e. incremented each time a device is attached).
  37. * Associating uevents with block devices in userspace is difficult and racy:
  38. * the uevent netlink socket is lossy, and on slow and overloaded systems has
  39. * a very high latency.
  40. * Block devices do not have exclusive owners in userspace, any process can set
  41. * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
  42. * can be reused again and again).
  43. * A userspace process setting up a block device and watching for its events
  44. * cannot thus reliably tell whether an event relates to the device it just set
  45. * up or another earlier instance with the same name.
  46. * This sequential number allows userspace processes to solve this problem, and
  47. * uniquely associate an uevent to the lifetime to a device.
  48. */
  49. static atomic64_t diskseq;
  50. /* for extended dynamic devt allocation, currently only one major is used */
  51. #define NR_EXT_DEVT (1 << MINORBITS)
  52. static DEFINE_IDA(ext_devt_ida);
  53. void set_capacity(struct gendisk *disk, sector_t sectors)
  54. {
  55. if (sectors > BLK_DEV_MAX_SECTORS) {
  56. pr_warn_once("%s: truncate capacity from %lld to %lld\n",
  57. disk->disk_name, sectors,
  58. BLK_DEV_MAX_SECTORS);
  59. sectors = BLK_DEV_MAX_SECTORS;
  60. }
  61. bdev_set_nr_sectors(disk->part0, sectors);
  62. }
  63. EXPORT_SYMBOL(set_capacity);
  64. /*
  65. * Set disk capacity and notify if the size is not currently zero and will not
  66. * be set to zero. Returns true if a uevent was sent, otherwise false.
  67. */
  68. bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
  69. {
  70. sector_t capacity = get_capacity(disk);
  71. char *envp[] = { "RESIZE=1", NULL };
  72. set_capacity(disk, size);
  73. /*
  74. * Only print a message and send a uevent if the gendisk is user visible
  75. * and alive. This avoids spamming the log and udev when setting the
  76. * initial capacity during probing.
  77. */
  78. if (size == capacity ||
  79. !disk_live(disk) ||
  80. (disk->flags & GENHD_FL_HIDDEN))
  81. return false;
  82. pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n",
  83. disk->disk_name, capacity, size);
  84. /*
  85. * Historically we did not send a uevent for changes to/from an empty
  86. * device.
  87. */
  88. if (!capacity || !size)
  89. return false;
  90. kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
  91. return true;
  92. }
  93. EXPORT_SYMBOL_GPL(set_capacity_and_notify);
  94. static void part_stat_read_all(struct block_device *part,
  95. struct disk_stats *stat)
  96. {
  97. int cpu;
  98. memset(stat, 0, sizeof(struct disk_stats));
  99. for_each_possible_cpu(cpu) {
  100. struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
  101. int group;
  102. for (group = 0; group < NR_STAT_GROUPS; group++) {
  103. stat->nsecs[group] += ptr->nsecs[group];
  104. stat->sectors[group] += ptr->sectors[group];
  105. stat->ios[group] += ptr->ios[group];
  106. stat->merges[group] += ptr->merges[group];
  107. }
  108. stat->io_ticks += ptr->io_ticks;
  109. }
  110. }
  111. static void bdev_count_inflight_rw(struct block_device *part,
  112. unsigned int inflight[2], bool mq_driver)
  113. {
  114. int write = 0;
  115. int read = 0;
  116. int cpu;
  117. if (mq_driver) {
  118. blk_mq_in_driver_rw(part, inflight);
  119. return;
  120. }
  121. for_each_possible_cpu(cpu) {
  122. read += part_stat_local_read_cpu(part, in_flight[READ], cpu);
  123. write += part_stat_local_read_cpu(part, in_flight[WRITE], cpu);
  124. }
  125. /*
  126. * While iterating all CPUs, some IOs may be issued from a CPU already
  127. * traversed and complete on a CPU that has not yet been traversed,
  128. * causing the inflight number to be negative.
  129. */
  130. inflight[READ] = read > 0 ? read : 0;
  131. inflight[WRITE] = write > 0 ? write : 0;
  132. }
  133. /**
  134. * bdev_count_inflight - get the number of inflight IOs for a block device.
  135. *
  136. * @part: the block device.
  137. *
  138. * Inflight here means started IO accounting, from bdev_start_io_acct() for
  139. * bio-based block device, and from blk_account_io_start() for rq-based block
  140. * device.
  141. */
  142. unsigned int bdev_count_inflight(struct block_device *part)
  143. {
  144. unsigned int inflight[2] = {0};
  145. bdev_count_inflight_rw(part, inflight, false);
  146. return inflight[READ] + inflight[WRITE];
  147. }
  148. EXPORT_SYMBOL_GPL(bdev_count_inflight);
  149. /*
  150. * Can be deleted altogether. Later.
  151. *
  152. */
  153. #define BLKDEV_MAJOR_HASH_SIZE 255
  154. static struct blk_major_name {
  155. struct blk_major_name *next;
  156. int major;
  157. char name[16];
  158. #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
  159. void (*probe)(dev_t devt);
  160. #endif
  161. } *major_names[BLKDEV_MAJOR_HASH_SIZE];
  162. static DEFINE_MUTEX(major_names_lock);
  163. static DEFINE_SPINLOCK(major_names_spinlock);
  164. /* index in the above - for now: assume no multimajor ranges */
  165. static inline int major_to_index(unsigned major)
  166. {
  167. return major % BLKDEV_MAJOR_HASH_SIZE;
  168. }
  169. #ifdef CONFIG_PROC_FS
  170. void blkdev_show(struct seq_file *seqf, off_t offset)
  171. {
  172. struct blk_major_name *dp;
  173. spin_lock(&major_names_spinlock);
  174. for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
  175. if (dp->major == offset)
  176. seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
  177. spin_unlock(&major_names_spinlock);
  178. }
  179. #endif /* CONFIG_PROC_FS */
  180. /**
  181. * __register_blkdev - register a new block device
  182. *
  183. * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
  184. * @major = 0, try to allocate any unused major number.
  185. * @name: the name of the new block device as a zero terminated string
  186. * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
  187. * pre-created device node is accessed. When a probe call uses
  188. * add_disk() and it fails the driver must cleanup resources. This
  189. * interface may soon be removed.
  190. *
  191. * The @name must be unique within the system.
  192. *
  193. * The return value depends on the @major input parameter:
  194. *
  195. * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1]
  196. * then the function returns zero on success, or a negative error code
  197. * - if any unused major number was requested with @major = 0 parameter
  198. * then the return value is the allocated major number in range
  199. * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise
  200. *
  201. * See Documentation/admin-guide/devices.txt for the list of allocated
  202. * major numbers.
  203. *
  204. * Use register_blkdev instead for any new code.
  205. */
  206. int __register_blkdev(unsigned int major, const char *name,
  207. void (*probe)(dev_t devt))
  208. {
  209. struct blk_major_name **n, *p;
  210. int index, ret = 0;
  211. mutex_lock(&major_names_lock);
  212. /* temporary */
  213. if (major == 0) {
  214. for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
  215. if (major_names[index] == NULL)
  216. break;
  217. }
  218. if (index == 0) {
  219. printk("%s: failed to get major for %s\n",
  220. __func__, name);
  221. ret = -EBUSY;
  222. goto out;
  223. }
  224. major = index;
  225. ret = major;
  226. }
  227. if (major >= BLKDEV_MAJOR_MAX) {
  228. pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n",
  229. __func__, major, BLKDEV_MAJOR_MAX-1, name);
  230. ret = -EINVAL;
  231. goto out;
  232. }
  233. p = kmalloc_obj(struct blk_major_name);
  234. if (p == NULL) {
  235. ret = -ENOMEM;
  236. goto out;
  237. }
  238. p->major = major;
  239. #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
  240. p->probe = probe;
  241. #endif
  242. strscpy(p->name, name, sizeof(p->name));
  243. p->next = NULL;
  244. index = major_to_index(major);
  245. spin_lock(&major_names_spinlock);
  246. for (n = &major_names[index]; *n; n = &(*n)->next) {
  247. if ((*n)->major == major)
  248. break;
  249. }
  250. if (!*n)
  251. *n = p;
  252. else
  253. ret = -EBUSY;
  254. spin_unlock(&major_names_spinlock);
  255. if (ret < 0) {
  256. printk("register_blkdev: cannot get major %u for %s\n",
  257. major, name);
  258. kfree(p);
  259. }
  260. out:
  261. mutex_unlock(&major_names_lock);
  262. return ret;
  263. }
  264. EXPORT_SYMBOL(__register_blkdev);
  265. void unregister_blkdev(unsigned int major, const char *name)
  266. {
  267. struct blk_major_name **n;
  268. struct blk_major_name *p = NULL;
  269. int index = major_to_index(major);
  270. mutex_lock(&major_names_lock);
  271. spin_lock(&major_names_spinlock);
  272. for (n = &major_names[index]; *n; n = &(*n)->next)
  273. if ((*n)->major == major)
  274. break;
  275. if (!*n || strcmp((*n)->name, name)) {
  276. WARN_ON(1);
  277. } else {
  278. p = *n;
  279. *n = p->next;
  280. }
  281. spin_unlock(&major_names_spinlock);
  282. mutex_unlock(&major_names_lock);
  283. kfree(p);
  284. }
  285. EXPORT_SYMBOL(unregister_blkdev);
  286. int blk_alloc_ext_minor(void)
  287. {
  288. int idx;
  289. idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL);
  290. if (idx == -ENOSPC)
  291. return -EBUSY;
  292. return idx;
  293. }
  294. void blk_free_ext_minor(unsigned int minor)
  295. {
  296. ida_free(&ext_devt_ida, minor);
  297. }
  298. void disk_uevent(struct gendisk *disk, enum kobject_action action)
  299. {
  300. struct block_device *part;
  301. unsigned long idx;
  302. rcu_read_lock();
  303. xa_for_each(&disk->part_tbl, idx, part) {
  304. if (bdev_is_partition(part) && !bdev_nr_sectors(part))
  305. continue;
  306. if (!kobject_get_unless_zero(&part->bd_device.kobj))
  307. continue;
  308. rcu_read_unlock();
  309. kobject_uevent(bdev_kobj(part), action);
  310. put_device(&part->bd_device);
  311. rcu_read_lock();
  312. }
  313. rcu_read_unlock();
  314. }
  315. EXPORT_SYMBOL_GPL(disk_uevent);
  316. int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
  317. {
  318. struct file *file;
  319. int ret = 0;
  320. if (!disk_has_partscan(disk))
  321. return -EINVAL;
  322. if (disk->open_partitions)
  323. return -EBUSY;
  324. /*
  325. * If the device is opened exclusively by current thread already, it's
  326. * safe to scan partitons, otherwise, use bd_prepare_to_claim() to
  327. * synchronize with other exclusive openers and other partition
  328. * scanners.
  329. */
  330. if (!(mode & BLK_OPEN_EXCL)) {
  331. ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions,
  332. NULL);
  333. if (ret)
  334. return ret;
  335. }
  336. set_bit(GD_NEED_PART_SCAN, &disk->state);
  337. file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL,
  338. NULL, NULL);
  339. if (IS_ERR(file))
  340. ret = PTR_ERR(file);
  341. else
  342. fput(file);
  343. /*
  344. * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
  345. * and this will cause that re-assemble partitioned raid device will
  346. * creat partition for underlying disk.
  347. */
  348. clear_bit(GD_NEED_PART_SCAN, &disk->state);
  349. if (!(mode & BLK_OPEN_EXCL))
  350. bd_abort_claiming(disk->part0, disk_scan_partitions);
  351. return ret;
  352. }
  353. static void add_disk_final(struct gendisk *disk)
  354. {
  355. struct device *ddev = disk_to_dev(disk);
  356. if (!(disk->flags & GENHD_FL_HIDDEN)) {
  357. /* Make sure the first partition scan will be proceed */
  358. if (get_capacity(disk) && disk_has_partscan(disk))
  359. set_bit(GD_NEED_PART_SCAN, &disk->state);
  360. bdev_add(disk->part0, ddev->devt);
  361. if (get_capacity(disk))
  362. disk_scan_partitions(disk, BLK_OPEN_READ);
  363. /*
  364. * Announce the disk and partitions after all partitions are
  365. * created. (for hidden disks uevents remain suppressed forever)
  366. */
  367. dev_set_uevent_suppress(ddev, 0);
  368. disk_uevent(disk, KOBJ_ADD);
  369. }
  370. blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
  371. disk_add_events(disk);
  372. set_bit(GD_ADDED, &disk->state);
  373. }
  374. static int __add_disk(struct device *parent, struct gendisk *disk,
  375. const struct attribute_group **groups,
  376. struct fwnode_handle *fwnode)
  377. {
  378. struct device *ddev = disk_to_dev(disk);
  379. int ret;
  380. if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS))
  381. return -EINVAL;
  382. if (queue_is_mq(disk->queue)) {
  383. /*
  384. * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers.
  385. */
  386. if (disk->fops->submit_bio || disk->fops->poll_bio)
  387. return -EINVAL;
  388. } else {
  389. if (!disk->fops->submit_bio)
  390. return -EINVAL;
  391. bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
  392. }
  393. /*
  394. * If the driver provides an explicit major number it also must provide
  395. * the number of minors numbers supported, and those will be used to
  396. * setup the gendisk.
  397. * Otherwise just allocate the device numbers for both the whole device
  398. * and all partitions from the extended dev_t space.
  399. */
  400. ret = -EINVAL;
  401. if (disk->major) {
  402. if (WARN_ON(!disk->minors))
  403. goto out;
  404. if (disk->minors > DISK_MAX_PARTS) {
  405. pr_err("block: can't allocate more than %d partitions\n",
  406. DISK_MAX_PARTS);
  407. disk->minors = DISK_MAX_PARTS;
  408. }
  409. if (disk->first_minor > MINORMASK ||
  410. disk->minors > MINORMASK + 1 ||
  411. disk->first_minor + disk->minors > MINORMASK + 1)
  412. goto out;
  413. } else {
  414. if (WARN_ON(disk->minors))
  415. goto out;
  416. ret = blk_alloc_ext_minor();
  417. if (ret < 0)
  418. goto out;
  419. disk->major = BLOCK_EXT_MAJOR;
  420. disk->first_minor = ret;
  421. }
  422. /* delay uevents, until we scanned partition table */
  423. dev_set_uevent_suppress(ddev, 1);
  424. ddev->parent = parent;
  425. ddev->groups = groups;
  426. dev_set_name(ddev, "%s", disk->disk_name);
  427. if (fwnode)
  428. device_set_node(ddev, fwnode);
  429. if (!(disk->flags & GENHD_FL_HIDDEN))
  430. ddev->devt = MKDEV(disk->major, disk->first_minor);
  431. ret = device_add(ddev);
  432. if (ret)
  433. goto out_free_ext_minor;
  434. ret = disk_alloc_events(disk);
  435. if (ret)
  436. goto out_device_del;
  437. ret = sysfs_create_link(block_depr, &ddev->kobj,
  438. kobject_name(&ddev->kobj));
  439. if (ret)
  440. goto out_device_del;
  441. /*
  442. * avoid probable deadlock caused by allocating memory with
  443. * GFP_KERNEL in runtime_resume callback of its all ancestor
  444. * devices
  445. */
  446. pm_runtime_set_memalloc_noio(ddev, true);
  447. disk->part0->bd_holder_dir =
  448. kobject_create_and_add("holders", &ddev->kobj);
  449. if (!disk->part0->bd_holder_dir) {
  450. ret = -ENOMEM;
  451. goto out_del_block_link;
  452. }
  453. disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
  454. if (!disk->slave_dir) {
  455. ret = -ENOMEM;
  456. goto out_put_holder_dir;
  457. }
  458. ret = blk_register_queue(disk);
  459. if (ret)
  460. goto out_put_slave_dir;
  461. if (!(disk->flags & GENHD_FL_HIDDEN)) {
  462. ret = bdi_register(disk->bdi, "%u:%u",
  463. disk->major, disk->first_minor);
  464. if (ret)
  465. goto out_unregister_queue;
  466. bdi_set_owner(disk->bdi, ddev);
  467. ret = sysfs_create_link(&ddev->kobj,
  468. &disk->bdi->dev->kobj, "bdi");
  469. if (ret)
  470. goto out_unregister_bdi;
  471. } else {
  472. /*
  473. * Even if the block_device for a hidden gendisk is not
  474. * registered, it needs to have a valid bd_dev so that the
  475. * freeing of the dynamic major works.
  476. */
  477. disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
  478. }
  479. return 0;
  480. out_unregister_bdi:
  481. if (!(disk->flags & GENHD_FL_HIDDEN))
  482. bdi_unregister(disk->bdi);
  483. out_unregister_queue:
  484. blk_unregister_queue(disk);
  485. rq_qos_exit(disk->queue);
  486. out_put_slave_dir:
  487. kobject_put(disk->slave_dir);
  488. disk->slave_dir = NULL;
  489. out_put_holder_dir:
  490. kobject_put(disk->part0->bd_holder_dir);
  491. out_del_block_link:
  492. sysfs_remove_link(block_depr, dev_name(ddev));
  493. pm_runtime_set_memalloc_noio(ddev, false);
  494. out_device_del:
  495. device_del(ddev);
  496. out_free_ext_minor:
  497. if (disk->major == BLOCK_EXT_MAJOR)
  498. blk_free_ext_minor(disk->first_minor);
  499. out:
  500. return ret;
  501. }
  502. /**
  503. * add_disk_fwnode - add disk information to kernel list with fwnode
  504. * @parent: parent device for the disk
  505. * @disk: per-device partitioning information
  506. * @groups: Additional per-device sysfs groups
  507. * @fwnode: attached disk fwnode
  508. *
  509. * This function registers the partitioning information in @disk
  510. * with the kernel. Also attach a fwnode to the disk device.
  511. */
  512. int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
  513. const struct attribute_group **groups,
  514. struct fwnode_handle *fwnode)
  515. {
  516. struct blk_mq_tag_set *set;
  517. unsigned int memflags;
  518. int ret;
  519. if (queue_is_mq(disk->queue)) {
  520. set = disk->queue->tag_set;
  521. memflags = memalloc_noio_save();
  522. down_read(&set->update_nr_hwq_lock);
  523. ret = __add_disk(parent, disk, groups, fwnode);
  524. up_read(&set->update_nr_hwq_lock);
  525. memalloc_noio_restore(memflags);
  526. } else {
  527. ret = __add_disk(parent, disk, groups, fwnode);
  528. }
  529. /*
  530. * add_disk_final() needn't to read `nr_hw_queues`, so move it out
  531. * of read lock `set->update_nr_hwq_lock` for avoiding unnecessary
  532. * lock dependency on `disk->open_mutex` from scanning partition.
  533. */
  534. if (!ret)
  535. add_disk_final(disk);
  536. return ret;
  537. }
  538. EXPORT_SYMBOL_GPL(add_disk_fwnode);
  539. /**
  540. * device_add_disk - add disk information to kernel list
  541. * @parent: parent device for the disk
  542. * @disk: per-device partitioning information
  543. * @groups: Additional per-device sysfs groups
  544. *
  545. * This function registers the partitioning information in @disk
  546. * with the kernel.
  547. */
  548. int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
  549. const struct attribute_group **groups)
  550. {
  551. return add_disk_fwnode(parent, disk, groups, NULL);
  552. }
  553. EXPORT_SYMBOL(device_add_disk);
  554. static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
  555. {
  556. struct block_device *bdev;
  557. unsigned long idx;
  558. /*
  559. * On surprise disk removal, bdev_mark_dead() may call into file
  560. * systems below. Make it clear that we're expecting to not hold
  561. * disk->open_mutex.
  562. */
  563. lockdep_assert_not_held(&disk->open_mutex);
  564. rcu_read_lock();
  565. xa_for_each(&disk->part_tbl, idx, bdev) {
  566. if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
  567. continue;
  568. rcu_read_unlock();
  569. bdev_mark_dead(bdev, surprise);
  570. put_device(&bdev->bd_device);
  571. rcu_read_lock();
  572. }
  573. rcu_read_unlock();
  574. }
  575. static bool __blk_mark_disk_dead(struct gendisk *disk)
  576. {
  577. /*
  578. * Fail any new I/O.
  579. */
  580. if (test_and_set_bit(GD_DEAD, &disk->state))
  581. return false;
  582. if (test_bit(GD_OWNS_QUEUE, &disk->state))
  583. blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
  584. /*
  585. * Stop buffered writers from dirtying pages that can't be written out.
  586. */
  587. set_capacity(disk, 0);
  588. /*
  589. * Prevent new I/O from crossing bio_queue_enter().
  590. */
  591. return blk_queue_start_drain(disk->queue);
  592. }
  593. /**
  594. * blk_mark_disk_dead - mark a disk as dead
  595. * @disk: disk to mark as dead
  596. *
  597. * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O
  598. * to this disk.
  599. */
  600. void blk_mark_disk_dead(struct gendisk *disk)
  601. {
  602. __blk_mark_disk_dead(disk);
  603. blk_report_disk_dead(disk, true);
  604. }
  605. EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
  606. static void __del_gendisk(struct gendisk *disk)
  607. {
  608. struct request_queue *q = disk->queue;
  609. struct block_device *part;
  610. unsigned long idx;
  611. bool start_drain;
  612. might_sleep();
  613. if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
  614. return;
  615. disk_del_events(disk);
  616. /*
  617. * Prevent new openers by unlinked the bdev inode.
  618. */
  619. mutex_lock(&disk->open_mutex);
  620. xa_for_each(&disk->part_tbl, idx, part)
  621. bdev_unhash(part);
  622. mutex_unlock(&disk->open_mutex);
  623. /*
  624. * Tell the file system to write back all dirty data and shut down if
  625. * it hasn't been notified earlier.
  626. */
  627. if (!test_bit(GD_DEAD, &disk->state))
  628. blk_report_disk_dead(disk, false);
  629. /*
  630. * Drop all partitions now that the disk is marked dead.
  631. */
  632. mutex_lock(&disk->open_mutex);
  633. start_drain = __blk_mark_disk_dead(disk);
  634. if (start_drain)
  635. blk_freeze_acquire_lock(q);
  636. xa_for_each_start(&disk->part_tbl, idx, part, 1)
  637. drop_partition(part);
  638. mutex_unlock(&disk->open_mutex);
  639. if (!(disk->flags & GENHD_FL_HIDDEN)) {
  640. sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
  641. /*
  642. * Unregister bdi before releasing device numbers (as they can
  643. * get reused and we'd get clashes in sysfs).
  644. */
  645. bdi_unregister(disk->bdi);
  646. }
  647. blk_unregister_queue(disk);
  648. kobject_put(disk->part0->bd_holder_dir);
  649. kobject_put(disk->slave_dir);
  650. disk->slave_dir = NULL;
  651. part_stat_set_all(disk->part0, 0);
  652. disk->part0->bd_stamp = 0;
  653. sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
  654. pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
  655. device_del(disk_to_dev(disk));
  656. blk_mq_freeze_queue_wait(q);
  657. blk_throtl_cancel_bios(disk);
  658. blk_sync_queue(q);
  659. blk_flush_integrity();
  660. if (queue_is_mq(q))
  661. blk_mq_cancel_work_sync(q);
  662. rq_qos_exit(q);
  663. /*
  664. * If the disk does not own the queue, allow using passthrough requests
  665. * again. Else leave the queue frozen to fail all I/O.
  666. */
  667. if (!test_bit(GD_OWNS_QUEUE, &disk->state))
  668. __blk_mq_unfreeze_queue(q, true);
  669. else if (queue_is_mq(q))
  670. blk_mq_exit_queue(q);
  671. if (start_drain)
  672. blk_unfreeze_release_lock(q);
  673. }
  674. static void disable_elv_switch(struct request_queue *q)
  675. {
  676. struct blk_mq_tag_set *set = q->tag_set;
  677. WARN_ON_ONCE(!queue_is_mq(q));
  678. down_write(&set->update_nr_hwq_lock);
  679. blk_queue_flag_set(QUEUE_FLAG_NO_ELV_SWITCH, q);
  680. up_write(&set->update_nr_hwq_lock);
  681. }
  682. /**
  683. * del_gendisk - remove the gendisk
  684. * @disk: the struct gendisk to remove
  685. *
  686. * Removes the gendisk and all its associated resources. This deletes the
  687. * partitions associated with the gendisk, and unregisters the associated
  688. * request_queue.
  689. *
  690. * This is the counter to the respective device_add_disk() call.
  691. *
  692. * The final removal of the struct gendisk happens when its refcount reaches 0
  693. * with put_disk(), which should be called after del_gendisk(), if
  694. * device_add_disk() was used.
  695. *
  696. * Drivers exist which depend on the release of the gendisk to be synchronous,
  697. * it should not be deferred.
  698. *
  699. * Context: can sleep
  700. */
  701. void del_gendisk(struct gendisk *disk)
  702. {
  703. struct blk_mq_tag_set *set;
  704. unsigned int memflags;
  705. if (!queue_is_mq(disk->queue)) {
  706. __del_gendisk(disk);
  707. } else {
  708. set = disk->queue->tag_set;
  709. disable_elv_switch(disk->queue);
  710. memflags = memalloc_noio_save();
  711. down_read(&set->update_nr_hwq_lock);
  712. __del_gendisk(disk);
  713. up_read(&set->update_nr_hwq_lock);
  714. memalloc_noio_restore(memflags);
  715. }
  716. }
  717. EXPORT_SYMBOL(del_gendisk);
  718. /**
  719. * invalidate_disk - invalidate the disk
  720. * @disk: the struct gendisk to invalidate
  721. *
  722. * A helper to invalidates the disk. It will clean the disk's associated
  723. * buffer/page caches and reset its internal states so that the disk
  724. * can be reused by the drivers.
  725. *
  726. * Context: can sleep
  727. */
  728. void invalidate_disk(struct gendisk *disk)
  729. {
  730. struct block_device *bdev = disk->part0;
  731. invalidate_bdev(bdev);
  732. bdev->bd_mapping->wb_err = 0;
  733. set_capacity(disk, 0);
  734. }
  735. EXPORT_SYMBOL(invalidate_disk);
  736. /* sysfs access to bad-blocks list. */
  737. static ssize_t disk_badblocks_show(struct device *dev,
  738. struct device_attribute *attr,
  739. char *page)
  740. {
  741. struct gendisk *disk = dev_to_disk(dev);
  742. if (!disk->bb)
  743. return sysfs_emit(page, "\n");
  744. return badblocks_show(disk->bb, page, 0);
  745. }
  746. static ssize_t disk_badblocks_store(struct device *dev,
  747. struct device_attribute *attr,
  748. const char *page, size_t len)
  749. {
  750. struct gendisk *disk = dev_to_disk(dev);
  751. if (!disk->bb)
  752. return -ENXIO;
  753. return badblocks_store(disk->bb, page, len, 0);
  754. }
  755. #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
  756. static bool blk_probe_dev(dev_t devt)
  757. {
  758. unsigned int major = MAJOR(devt);
  759. struct blk_major_name **n;
  760. mutex_lock(&major_names_lock);
  761. for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
  762. if ((*n)->major == major && (*n)->probe) {
  763. (*n)->probe(devt);
  764. mutex_unlock(&major_names_lock);
  765. return true;
  766. }
  767. }
  768. mutex_unlock(&major_names_lock);
  769. return false;
  770. }
  771. void blk_request_module(dev_t devt)
  772. {
  773. int error;
  774. if (blk_probe_dev(devt))
  775. return;
  776. error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt));
  777. /* Make old-style 2.4 aliases work */
  778. if (error > 0)
  779. error = request_module("block-major-%d", MAJOR(devt));
  780. if (!error)
  781. blk_probe_dev(devt);
  782. }
  783. #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
  784. #ifdef CONFIG_PROC_FS
  785. /* iterator */
  786. static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
  787. {
  788. loff_t skip = *pos;
  789. struct class_dev_iter *iter;
  790. struct device *dev;
  791. iter = kmalloc_obj(*iter);
  792. if (!iter)
  793. return ERR_PTR(-ENOMEM);
  794. seqf->private = iter;
  795. class_dev_iter_init(iter, &block_class, NULL, &disk_type);
  796. do {
  797. dev = class_dev_iter_next(iter);
  798. if (!dev)
  799. return NULL;
  800. } while (skip--);
  801. return dev_to_disk(dev);
  802. }
  803. static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
  804. {
  805. struct device *dev;
  806. (*pos)++;
  807. dev = class_dev_iter_next(seqf->private);
  808. if (dev)
  809. return dev_to_disk(dev);
  810. return NULL;
  811. }
  812. static void disk_seqf_stop(struct seq_file *seqf, void *v)
  813. {
  814. struct class_dev_iter *iter = seqf->private;
  815. /* stop is called even after start failed :-( */
  816. if (iter) {
  817. class_dev_iter_exit(iter);
  818. kfree(iter);
  819. seqf->private = NULL;
  820. }
  821. }
  822. static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
  823. {
  824. void *p;
  825. p = disk_seqf_start(seqf, pos);
  826. if (!IS_ERR_OR_NULL(p) && !*pos)
  827. seq_puts(seqf, "major minor #blocks name\n\n");
  828. return p;
  829. }
  830. static int show_partition(struct seq_file *seqf, void *v)
  831. {
  832. struct gendisk *sgp = v;
  833. struct block_device *part;
  834. unsigned long idx;
  835. if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
  836. return 0;
  837. rcu_read_lock();
  838. xa_for_each(&sgp->part_tbl, idx, part) {
  839. if (!bdev_nr_sectors(part))
  840. continue;
  841. seq_printf(seqf, "%4d %7d %10llu %pg\n",
  842. MAJOR(part->bd_dev), MINOR(part->bd_dev),
  843. bdev_nr_sectors(part) >> 1, part);
  844. }
  845. rcu_read_unlock();
  846. return 0;
  847. }
/*
 * seq_file operations for the partitions listing: the shared disk iterator
 * with a header-printing start and show_partition() as the per-disk output.
 */
static const struct seq_operations partitions_op = {
	.start = show_partition_start,
	.next = disk_seqf_next,
	.stop = disk_seqf_stop,
	.show = show_partition
};
  854. #endif
  855. static int __init genhd_device_init(void)
  856. {
  857. int error;
  858. error = class_register(&block_class);
  859. if (unlikely(error))
  860. return error;
  861. blk_dev_init();
  862. register_blkdev(BLOCK_EXT_MAJOR, "blkext");
  863. /* create top-level block dir */
  864. block_depr = kobject_create_and_add("block", NULL);
  865. return 0;
  866. }
  867. subsys_initcall(genhd_device_init);
  868. static ssize_t disk_range_show(struct device *dev,
  869. struct device_attribute *attr, char *buf)
  870. {
  871. struct gendisk *disk = dev_to_disk(dev);
  872. return sysfs_emit(buf, "%d\n", disk->minors);
  873. }
  874. static ssize_t disk_ext_range_show(struct device *dev,
  875. struct device_attribute *attr, char *buf)
  876. {
  877. struct gendisk *disk = dev_to_disk(dev);
  878. return sysfs_emit(buf, "%d\n",
  879. (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
  880. }
  881. static ssize_t disk_removable_show(struct device *dev,
  882. struct device_attribute *attr, char *buf)
  883. {
  884. struct gendisk *disk = dev_to_disk(dev);
  885. return sysfs_emit(buf, "%d\n",
  886. (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
  887. }
  888. static ssize_t disk_hidden_show(struct device *dev,
  889. struct device_attribute *attr, char *buf)
  890. {
  891. struct gendisk *disk = dev_to_disk(dev);
  892. return sysfs_emit(buf, "%d\n",
  893. (disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
  894. }
  895. static ssize_t disk_ro_show(struct device *dev,
  896. struct device_attribute *attr, char *buf)
  897. {
  898. struct gendisk *disk = dev_to_disk(dev);
  899. return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
  900. }
  901. ssize_t part_size_show(struct device *dev,
  902. struct device_attribute *attr, char *buf)
  903. {
  904. return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
  905. }
  906. ssize_t part_stat_show(struct device *dev,
  907. struct device_attribute *attr, char *buf)
  908. {
  909. struct block_device *bdev = dev_to_bdev(dev);
  910. struct disk_stats stat;
  911. unsigned int inflight;
  912. inflight = bdev_count_inflight(bdev);
  913. if (inflight) {
  914. part_stat_lock();
  915. update_io_ticks(bdev, jiffies, true);
  916. part_stat_unlock();
  917. }
  918. part_stat_read_all(bdev, &stat);
  919. return sysfs_emit(buf,
  920. "%8lu %8lu %8llu %8u "
  921. "%8lu %8lu %8llu %8u "
  922. "%8u %8u %8u "
  923. "%8lu %8lu %8llu %8u "
  924. "%8lu %8u"
  925. "\n",
  926. stat.ios[STAT_READ],
  927. stat.merges[STAT_READ],
  928. (unsigned long long)stat.sectors[STAT_READ],
  929. (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC),
  930. stat.ios[STAT_WRITE],
  931. stat.merges[STAT_WRITE],
  932. (unsigned long long)stat.sectors[STAT_WRITE],
  933. (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC),
  934. inflight,
  935. jiffies_to_msecs(stat.io_ticks),
  936. (unsigned int)div_u64(stat.nsecs[STAT_READ] +
  937. stat.nsecs[STAT_WRITE] +
  938. stat.nsecs[STAT_DISCARD] +
  939. stat.nsecs[STAT_FLUSH],
  940. NSEC_PER_MSEC),
  941. stat.ios[STAT_DISCARD],
  942. stat.merges[STAT_DISCARD],
  943. (unsigned long long)stat.sectors[STAT_DISCARD],
  944. (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC),
  945. stat.ios[STAT_FLUSH],
  946. (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC));
  947. }
  948. /*
  949. * Show the number of IOs issued to driver.
  950. * For bio-based device, started from bdev_start_io_acct();
  951. * For rq-based device, started from blk_mq_start_request();
  952. */
  953. ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
  954. char *buf)
  955. {
  956. struct block_device *bdev = dev_to_bdev(dev);
  957. struct request_queue *q = bdev_get_queue(bdev);
  958. unsigned int inflight[2] = {0};
  959. bdev_count_inflight_rw(bdev, inflight, queue_is_mq(q));
  960. return sysfs_emit(buf, "%8u %8u\n", inflight[READ], inflight[WRITE]);
  961. }
  962. static ssize_t disk_capability_show(struct device *dev,
  963. struct device_attribute *attr, char *buf)
  964. {
  965. dev_warn_once(dev, "the capability attribute has been deprecated.\n");
  966. return sysfs_emit(buf, "0\n");
  967. }
  968. static ssize_t disk_alignment_offset_show(struct device *dev,
  969. struct device_attribute *attr,
  970. char *buf)
  971. {
  972. struct gendisk *disk = dev_to_disk(dev);
  973. return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
  974. }
  975. static ssize_t disk_discard_alignment_show(struct device *dev,
  976. struct device_attribute *attr,
  977. char *buf)
  978. {
  979. struct gendisk *disk = dev_to_disk(dev);
  980. return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
  981. }
  982. static ssize_t diskseq_show(struct device *dev,
  983. struct device_attribute *attr, char *buf)
  984. {
  985. struct gendisk *disk = dev_to_disk(dev);
  986. return sysfs_emit(buf, "%llu\n", disk->diskseq);
  987. }
  988. static ssize_t partscan_show(struct device *dev,
  989. struct device_attribute *attr, char *buf)
  990. {
  991. return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
  992. }
/*
 * sysfs attribute definitions for the disk device.  All are read-only (0444)
 * except "badblocks", which is 0644 and has a store handler.
 */
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL);
static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL);
static DEVICE_ATTR(size, 0444, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL);
static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
  1007. #ifdef CONFIG_FAIL_MAKE_REQUEST
  1008. ssize_t part_fail_show(struct device *dev,
  1009. struct device_attribute *attr, char *buf)
  1010. {
  1011. return sysfs_emit(buf, "%d\n",
  1012. bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
  1013. }
  1014. ssize_t part_fail_store(struct device *dev,
  1015. struct device_attribute *attr,
  1016. const char *buf, size_t count)
  1017. {
  1018. int i;
  1019. if (count > 0 && sscanf(buf, "%d", &i) > 0) {
  1020. if (i)
  1021. bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
  1022. else
  1023. bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
  1024. }
  1025. return count;
  1026. }
/*
 * Declared with __ATTR() rather than DEVICE_ATTR(): the sysfs file name
 * "make-it-fail" contains dashes and so cannot be derived from the
 * variable name.
 */
static struct device_attribute dev_attr_fail =
	__ATTR(make-it-fail, 0644, part_fail_show, part_fail_store);
  1029. #endif /* CONFIG_FAIL_MAKE_REQUEST */
  1030. #ifdef CONFIG_FAIL_IO_TIMEOUT
/* "io-timeout-fail" attribute; its handlers are defined outside this section. */
static struct device_attribute dev_attr_fail_timeout =
	__ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store);
  1033. #endif
/*
 * NULL-terminated list of the attributes in the disk's sysfs attribute
 * group.  The fault-injection entries are only present when the matching
 * CONFIG_FAIL_* option is enabled.
 */
static struct attribute *disk_attrs[] = {
	&dev_attr_range.attr,
	&dev_attr_ext_range.attr,
	&dev_attr_removable.attr,
	&dev_attr_hidden.attr,
	&dev_attr_ro.attr,
	&dev_attr_size.attr,
	&dev_attr_alignment_offset.attr,
	&dev_attr_discard_alignment.attr,
	&dev_attr_capability.attr,
	&dev_attr_stat.attr,
	&dev_attr_inflight.attr,
	&dev_attr_badblocks.attr,
	&dev_attr_events.attr,
	&dev_attr_events_async.attr,
	&dev_attr_events_poll_msecs.attr,
	&dev_attr_diskseq.attr,
	&dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
	&dev_attr_fail.attr,
#endif
#ifdef CONFIG_FAIL_IO_TIMEOUT
	&dev_attr_fail_timeout.attr,
#endif
	NULL
};
  1060. static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
  1061. {
  1062. struct device *dev = container_of(kobj, typeof(*dev), kobj);
  1063. struct gendisk *disk = dev_to_disk(dev);
  1064. if (a == &dev_attr_badblocks.attr && !disk->bb)
  1065. return 0;
  1066. return a->mode;
  1067. }
/* Disk attribute group: disk_attrs filtered through disk_visible(). */
static struct attribute_group disk_attr_group = {
	.attrs = disk_attrs,
	.is_visible = disk_visible,
};
/*
 * NULL-terminated list of attribute groups used by disk_type.  The trace and
 * integrity groups are only included when their config options are enabled.
 */
static const struct attribute_group *disk_attr_groups[] = {
	&disk_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
	&blk_trace_attr_group,
#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
	&blk_integrity_attr_group,
#endif
	NULL
};
  1082. /**
  1083. * disk_release - releases all allocated resources of the gendisk
  1084. * @dev: the device representing this disk
  1085. *
  1086. * This function releases all allocated resources of the gendisk.
  1087. *
  1088. * Drivers which used device_add_disk() have a gendisk with a request_queue
  1089. * assigned. Since the request_queue sits on top of the gendisk for these
  1090. * drivers we also call blk_put_queue() for them, and we expect the
  1091. * request_queue refcount to reach 0 at this point, and so the request_queue
  1092. * will also be freed prior to the disk.
  1093. *
  1094. * Context: can sleep
  1095. */
  1096. static void disk_release(struct device *dev)
  1097. {
  1098. struct gendisk *disk = dev_to_disk(dev);
  1099. might_sleep();
  1100. WARN_ON_ONCE(disk_live(disk));
  1101. blk_trace_remove(disk->queue);
  1102. /*
  1103. * To undo the all initialization from blk_mq_init_allocated_queue in
  1104. * case of a probe failure where add_disk is never called we have to
  1105. * call blk_mq_exit_queue here. We can't do this for the more common
  1106. * teardown case (yet) as the tagset can be gone by the time the disk
  1107. * is released once it was added.
  1108. */
  1109. if (queue_is_mq(disk->queue) &&
  1110. test_bit(GD_OWNS_QUEUE, &disk->state) &&
  1111. !test_bit(GD_ADDED, &disk->state))
  1112. blk_mq_exit_queue(disk->queue);
  1113. blkcg_exit_disk(disk);
  1114. bioset_exit(&disk->bio_split);
  1115. disk_release_events(disk);
  1116. kfree(disk->random);
  1117. disk_free_zone_resources(disk);
  1118. xa_destroy(&disk->part_tbl);
  1119. kobject_put(&disk->queue_kobj);
  1120. disk->queue->disk = NULL;
  1121. blk_put_queue(disk->queue);
  1122. if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
  1123. disk->fops->free_disk(disk);
  1124. bdev_drop(disk->part0); /* frees the disk */
  1125. }
  1126. static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
  1127. {
  1128. const struct gendisk *disk = dev_to_disk(dev);
  1129. return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
  1130. }
/* The "block" device class; uevents for its devices go through block_uevent(). */
const struct class block_class = {
	.name = "block",
	.dev_uevent = block_uevent,
};
  1135. static char *block_devnode(const struct device *dev, umode_t *mode,
  1136. kuid_t *uid, kgid_t *gid)
  1137. {
  1138. struct gendisk *disk = dev_to_disk(dev);
  1139. if (disk->fops->devnode)
  1140. return disk->fops->devnode(disk, mode);
  1141. return NULL;
  1142. }
/*
 * Device type for whole disks: supplies the sysfs attribute groups, the
 * release callback that frees the gendisk, and the devnode hook.
 */
const struct device_type disk_type = {
	.name = "disk",
	.groups = disk_attr_groups,
	.release = disk_release,
	.devnode = block_devnode,
};
  1149. #ifdef CONFIG_PROC_FS
  1150. /*
  1151. * aggregate disk stat collector. Uses the same stats that the sysfs
  1152. * entries do, above, but makes them available through one seq_file.
  1153. *
  1154. * The output looks suspiciously like /proc/partitions with a bunch of
  1155. * extra fields.
  1156. */
  1157. static int diskstats_show(struct seq_file *seqf, void *v)
  1158. {
  1159. struct gendisk *gp = v;
  1160. struct block_device *hd;
  1161. unsigned int inflight;
  1162. struct disk_stats stat;
  1163. unsigned long idx;
  1164. /*
  1165. if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
  1166. seq_puts(seqf, "major minor name"
  1167. " rio rmerge rsect ruse wio wmerge "
  1168. "wsect wuse running use aveq"
  1169. "\n\n");
  1170. */
  1171. rcu_read_lock();
  1172. xa_for_each(&gp->part_tbl, idx, hd) {
  1173. if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
  1174. continue;
  1175. inflight = bdev_count_inflight(hd);
  1176. if (inflight) {
  1177. part_stat_lock();
  1178. update_io_ticks(hd, jiffies, true);
  1179. part_stat_unlock();
  1180. }
  1181. part_stat_read_all(hd, &stat);
  1182. seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4);
  1183. seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
  1184. seq_printf(seqf, " %pg", hd);
  1185. seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
  1186. seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
  1187. seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
  1188. seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ],
  1189. NSEC_PER_MSEC));
  1190. seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
  1191. seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
  1192. seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
  1193. seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
  1194. NSEC_PER_MSEC));
  1195. seq_put_decimal_ull(seqf, " ", inflight);
  1196. seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
  1197. seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] +
  1198. stat.nsecs[STAT_WRITE] +
  1199. stat.nsecs[STAT_DISCARD] +
  1200. stat.nsecs[STAT_FLUSH],
  1201. NSEC_PER_MSEC));
  1202. seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
  1203. seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
  1204. seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
  1205. seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
  1206. NSEC_PER_MSEC));
  1207. seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
  1208. seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
  1209. NSEC_PER_MSEC));
  1210. seq_putc(seqf, '\n');
  1211. }
  1212. rcu_read_unlock();
  1213. return 0;
  1214. }
/*
 * seq_file operations for the diskstats listing: the shared disk iterator
 * with diskstats_show() as the per-disk output.
 */
static const struct seq_operations diskstats_op = {
	.start = disk_seqf_start,
	.next = disk_seqf_next,
	.stop = disk_seqf_stop,
	.show = diskstats_show
};
  1221. static int __init proc_genhd_init(void)
  1222. {
  1223. proc_create_seq("diskstats", 0, NULL, &diskstats_op);
  1224. proc_create_seq("partitions", 0, NULL, &partitions_op);
  1225. return 0;
  1226. }
  1227. module_init(proc_genhd_init);
  1228. #endif /* CONFIG_PROC_FS */
  1229. dev_t part_devt(struct gendisk *disk, u8 partno)
  1230. {
  1231. struct block_device *part;
  1232. dev_t devt = 0;
  1233. rcu_read_lock();
  1234. part = xa_load(&disk->part_tbl, partno);
  1235. if (part)
  1236. devt = part->bd_dev;
  1237. rcu_read_unlock();
  1238. return devt;
  1239. }
  1240. struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
  1241. struct lock_class_key *lkclass)
  1242. {
  1243. struct gendisk *disk;
  1244. disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
  1245. if (!disk)
  1246. return NULL;
  1247. if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
  1248. goto out_free_disk;
  1249. disk->bdi = bdi_alloc(node_id);
  1250. if (!disk->bdi)
  1251. goto out_free_bioset;
  1252. /* bdev_alloc() might need the queue, set before the first call */
  1253. disk->queue = q;
  1254. disk->part0 = bdev_alloc(disk, 0);
  1255. if (!disk->part0)
  1256. goto out_free_bdi;
  1257. disk->node_id = node_id;
  1258. mutex_init(&disk->open_mutex);
  1259. xa_init(&disk->part_tbl);
  1260. if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
  1261. goto out_destroy_part_tbl;
  1262. if (blkcg_init_disk(disk))
  1263. goto out_erase_part0;
  1264. disk_init_zone_resources(disk);
  1265. rand_initialize_disk(disk);
  1266. disk_to_dev(disk)->class = &block_class;
  1267. disk_to_dev(disk)->type = &disk_type;
  1268. device_initialize(disk_to_dev(disk));
  1269. inc_diskseq(disk);
  1270. q->disk = disk;
  1271. lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
  1272. #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
  1273. INIT_LIST_HEAD(&disk->slave_bdevs);
  1274. #endif
  1275. mutex_init(&disk->rqos_state_mutex);
  1276. kobject_init(&disk->queue_kobj, &blk_queue_ktype);
  1277. return disk;
  1278. out_erase_part0:
  1279. xa_erase(&disk->part_tbl, 0);
  1280. out_destroy_part_tbl:
  1281. xa_destroy(&disk->part_tbl);
  1282. disk->part0->bd_disk = NULL;
  1283. bdev_drop(disk->part0);
  1284. out_free_bdi:
  1285. bdi_put(disk->bdi);
  1286. out_free_bioset:
  1287. bioset_exit(&disk->bio_split);
  1288. out_free_disk:
  1289. kfree(disk);
  1290. return NULL;
  1291. }
  1292. struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
  1293. struct lock_class_key *lkclass)
  1294. {
  1295. struct queue_limits default_lim = { };
  1296. struct request_queue *q;
  1297. struct gendisk *disk;
  1298. q = blk_alloc_queue(lim ? lim : &default_lim, node);
  1299. if (IS_ERR(q))
  1300. return ERR_CAST(q);
  1301. disk = __alloc_disk_node(q, node, lkclass);
  1302. if (!disk) {
  1303. blk_put_queue(q);
  1304. return ERR_PTR(-ENOMEM);
  1305. }
  1306. set_bit(GD_OWNS_QUEUE, &disk->state);
  1307. return disk;
  1308. }
  1309. EXPORT_SYMBOL(__blk_alloc_disk);
  1310. /**
  1311. * put_disk - decrements the gendisk refcount
  1312. * @disk: the struct gendisk to decrement the refcount for
  1313. *
  1314. * This decrements the refcount for the struct gendisk. When this reaches 0
  1315. * we'll have disk_release() called.
  1316. *
  1317. * Note: for blk-mq disk put_disk must be called before freeing the tag_set
  1318. * when handling probe errors (that is before add_disk() is called).
  1319. *
  1320. * Context: Any context, but the last reference must not be dropped from
  1321. * atomic context.
  1322. */
  1323. void put_disk(struct gendisk *disk)
  1324. {
  1325. if (disk)
  1326. put_device(disk_to_dev(disk));
  1327. }
  1328. EXPORT_SYMBOL(put_disk);
  1329. static void set_disk_ro_uevent(struct gendisk *gd, int ro)
  1330. {
  1331. char event[] = "DISK_RO=1";
  1332. char *envp[] = { event, NULL };
  1333. if (!ro)
  1334. event[8] = '0';
  1335. kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
  1336. }
  1337. /**
  1338. * set_disk_ro - set a gendisk read-only
  1339. * @disk: gendisk to operate on
  1340. * @read_only: %true to set the disk read-only, %false set the disk read/write
  1341. *
  1342. * This function is used to indicate whether a given disk device should have its
  1343. * read-only flag set. set_disk_ro() is typically used by device drivers to
  1344. * indicate whether the underlying physical device is write-protected.
  1345. */
  1346. void set_disk_ro(struct gendisk *disk, bool read_only)
  1347. {
  1348. if (read_only) {
  1349. if (test_and_set_bit(GD_READ_ONLY, &disk->state))
  1350. return;
  1351. } else {
  1352. if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
  1353. return;
  1354. }
  1355. set_disk_ro_uevent(disk, read_only);
  1356. }
  1357. EXPORT_SYMBOL(set_disk_ro);
  1358. void inc_diskseq(struct gendisk *disk)
  1359. {
  1360. disk->diskseq = atomic64_inc_return(&diskseq);
  1361. }