summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/drbd/drbd_bitmap.c10
-rw-r--r--drivers/block/drbd/drbd_receiver.c8
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/block/loop.c4
-rw-r--r--drivers/block/nbd.c5
-rw-r--r--drivers/block/null_blk/main.c82
-rw-r--r--drivers/block/null_blk/null_blk.h3
-rw-r--r--drivers/block/null_blk/zoned.c6
-rw-r--r--drivers/block/ps3disk.c4
-rw-r--r--drivers/block/rnbd/rnbd-proto.h15
-rw-r--r--drivers/block/rnull/rnull.rs3
-rw-r--r--drivers/block/ublk_drv.c385
-rw-r--r--drivers/block/virtio_blk.c24
-rw-r--r--drivers/block/zloop.c160
-rw-r--r--drivers/md/bcache/alloc.c25
-rw-r--r--drivers/md/bcache/bcache.h6
-rw-r--r--drivers/md/bcache/bset.h8
-rw-r--r--drivers/md/bcache/btree.c53
-rw-r--r--drivers/md/bcache/journal.c93
-rw-r--r--drivers/md/bcache/journal.h13
-rw-r--r--drivers/md/bcache/super.c33
-rw-r--r--drivers/md/bcache/sysfs.c15
-rw-r--r--drivers/md/bcache/writeback.c5
-rw-r--r--drivers/md/dm-zone.c63
-rw-r--r--drivers/md/dm.h3
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md-llbitmap.c2
-rw-r--r--drivers/md/md.c259
-rw-r--r--drivers/md/md.h10
-rw-r--r--drivers/md/raid0.c20
-rw-r--r--drivers/md/raid1.c1
-rw-r--r--drivers/md/raid10.c1
-rw-r--r--drivers/md/raid5-cache.c2
-rw-r--r--drivers/md/raid5.c7
-rw-r--r--drivers/nvme/host/apple.c1
-rw-r--r--drivers/nvme/host/core.c15
-rw-r--r--drivers/nvme/host/fabrics.h6
-rw-r--r--drivers/nvme/host/fc.c1
-rw-r--r--drivers/nvme/host/multipath.c4
-rw-r--r--drivers/nvme/host/nvme.h9
-rw-r--r--drivers/nvme/host/pci.c118
-rw-r--r--drivers/nvme/host/rdma.c1
-rw-r--r--drivers/nvme/host/tcp.c1
-rw-r--r--drivers/nvme/host/zns.c10
-rw-r--r--drivers/nvme/target/loop.c1
-rw-r--r--drivers/s390/block/dasd.c64
-rw-r--r--drivers/s390/block/dasd_devmap.c3
-rw-r--r--drivers/s390/block/dasd_eckd.c8
-rw-r--r--drivers/s390/block/dasd_genhd.c80
-rw-r--r--drivers/scsi/sd.h2
-rw-r--r--drivers/scsi/sd_zbc.c20
51 files changed, 964 insertions, 712 deletions
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 85ca000a0564..d90fa3e7f4cf 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1210,7 +1210,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
return err;
}
-/**
+/*
* drbd_bm_read() - Read the whole bitmap from its on disk location.
* @device: DRBD device.
*/
@@ -1221,7 +1221,7 @@ int drbd_bm_read(struct drbd_device *device,
return bm_rw(device, BM_AIO_READ, 0);
}
-/**
+/*
* drbd_bm_write() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1233,7 +1233,7 @@ int drbd_bm_write(struct drbd_device *device,
return bm_rw(device, 0, 0);
}
-/**
+/*
* drbd_bm_write_all() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1255,7 +1255,7 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho
return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
}
-/**
+/*
* drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
* @device: DRBD device.
*
@@ -1272,7 +1272,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device,
return bm_rw(device, BM_AIO_COPY_PAGES, 0);
}
-/**
+/*
* drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
* @device: DRBD device.
*/
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 33bc91665fe8..3de919b6f0e1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1736,13 +1736,13 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
- data = kmap(page);
+ data = kmap_local_page(page);
err = drbd_recv_all_warn(peer_device->connection, data, len);
if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
drbd_err(device, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
- kunmap(page);
+ kunmap_local(data);
if (err) {
drbd_free_peer_req(device, peer_req);
return NULL;
@@ -1777,7 +1777,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
page = drbd_alloc_pages(peer_device, 1, 1);
- data = kmap(page);
+ data = kmap_local_page(page);
while (data_size) {
unsigned int len = min_t(int, data_size, PAGE_SIZE);
@@ -1786,7 +1786,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
break;
data_size -= len;
}
- kunmap(page);
+ kunmap_local(data);
drbd_free_pages(peer_device->device, page);
return err;
}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5336c3c5ca36..c28786e0fe1c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -329,7 +329,7 @@ static bool initialized;
* This default is used whenever the current disk size is unknown.
* [Now it is rather a minimum]
*/
-#define MAX_DISK_SIZE 4 /* 3984 */
+#define MAX_DISK_SIZE (PAGE_SIZE / 1024)
/*
* globals used by 'result()'
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 13ce229d450c..ebe751f39742 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1908,6 +1908,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
goto failed;
}
+ /* We can block in this context, so ignore REQ_NOWAIT. */
+ if (rq->cmd_flags & REQ_NOWAIT)
+ rq->cmd_flags &= ~REQ_NOWAIT;
+
if (cmd_blkcg_css)
kthread_associate_blkcg(cmd_blkcg_css);
if (cmd_memcg_css)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3263040fcf2d..f6c33b21f69e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1021,9 +1021,9 @@ static void recv_work(struct work_struct *work)
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
- nbd_config_put(nbd);
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
+ nbd_config_put(nbd);
kfree(args);
}
@@ -2238,12 +2238,13 @@ again:
ret = nbd_start_device(nbd);
out:
- mutex_unlock(&nbd->config_lock);
if (!ret) {
set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
refcount_inc(&nbd->config_refs);
nbd_connect_reply(info, nbd->index);
}
+ mutex_unlock(&nbd->config_lock);
+
nbd_config_put(nbd);
if (put_dev)
nbd_put(nbd);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 0ee55f889cfd..c7c0fb79a6bf 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,26 +1129,28 @@ again:
return 0;
}
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
- unsigned int off, sector_t sector, size_t n, bool is_fua)
+static blk_status_t copy_to_nullb(struct nullb *nullb, void *source,
+ loff_t pos, size_t n, bool is_fua)
{
size_t temp, count = 0;
- unsigned int offset;
struct nullb_page *t_page;
+ sector_t sector;
while (count < n) {
- temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ temp = min3(nullb->dev->blocksize, n - count,
+ PAGE_SIZE - offset_in_page(pos));
+ sector = pos >> SECTOR_SHIFT;
if (null_cache_active(nullb) && !is_fua)
null_make_cache_space(nullb, PAGE_SIZE);
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
t_page = null_insert_page(nullb, sector,
!null_cache_active(nullb) || is_fua);
if (!t_page)
- return -ENOSPC;
+ return BLK_STS_NOSPC;
- memcpy_page(t_page->page, offset, source, off + count, temp);
+ memcpy_to_page(t_page->page, offset_in_page(pos),
+ source + count, temp);
__set_bit(sector & SECTOR_MASK, t_page->bitmap);
@@ -1156,41 +1158,34 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source,
null_free_sector(nullb, sector, true);
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
- return 0;
+ return BLK_STS_OK;
}
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
- unsigned int off, sector_t sector, size_t n)
+static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos,
+ size_t n)
{
size_t temp, count = 0;
- unsigned int offset;
struct nullb_page *t_page;
+ sector_t sector;
while (count < n) {
- temp = min_t(size_t, nullb->dev->blocksize, n - count);
+ temp = min3(nullb->dev->blocksize, n - count,
+ PAGE_SIZE - offset_in_page(pos));
+ sector = pos >> SECTOR_SHIFT;
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
t_page = null_lookup_page(nullb, sector, false,
!null_cache_active(nullb));
-
if (t_page)
- memcpy_page(dest, off + count, t_page->page, offset,
- temp);
+ memcpy_from_page(dest + count, t_page->page,
+ offset_in_page(pos), temp);
else
- memzero_page(dest, off + count, temp);
+ memset(dest + count, 0, temp);
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
- return 0;
-}
-
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off)
-{
- memset_page(page, off, 0xff, len);
}
blk_status_t null_handle_discard(struct nullb_device *dev,
@@ -1234,34 +1229,39 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
return errno_to_blk_status(err);
}
-static int null_transfer(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off, bool is_write, sector_t sector,
+static blk_status_t null_transfer(struct nullb *nullb, struct page *page,
+ unsigned int len, unsigned int off, bool is_write, loff_t pos,
bool is_fua)
{
struct nullb_device *dev = nullb->dev;
+ blk_status_t err = BLK_STS_OK;
unsigned int valid_len = len;
- int err = 0;
+ void *p;
+ p = kmap_local_page(page) + off;
if (!is_write) {
- if (dev->zoned)
+ if (dev->zoned) {
valid_len = null_zone_valid_read_len(nullb,
- sector, len);
+ pos >> SECTOR_SHIFT, len);
+ if (valid_len && valid_len != len)
+ valid_len -= pos & (SECTOR_SIZE - 1);
+ }
if (valid_len) {
- err = copy_from_nullb(nullb, page, off,
- sector, valid_len);
+ copy_from_nullb(nullb, p, pos, valid_len);
off += valid_len;
len -= valid_len;
}
if (len)
- nullb_fill_pattern(nullb, page, len, off);
+ memset(p + valid_len, 0xff, len);
flush_dcache_page(page);
} else {
flush_dcache_page(page);
- err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+ err = copy_to_nullb(nullb, p, pos, len, is_fua);
}
+ kunmap_local(p);
return err;
}
@@ -1274,9 +1274,9 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
- int err = 0;
+ blk_status_t err = BLK_STS_OK;
unsigned int len;
- sector_t sector = blk_rq_pos(rq);
+ loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
unsigned int transferred_bytes = 0;
struct req_iterator iter;
@@ -1288,18 +1288,18 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
if (transferred_bytes + len > max_bytes)
len = max_bytes - transferred_bytes;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(req_op(rq)), sector,
+ op_is_write(req_op(rq)), pos,
rq->cmd_flags & REQ_FUA);
if (err)
break;
- sector += len >> SECTOR_SHIFT;
+ pos += len;
transferred_bytes += len;
if (transferred_bytes >= max_bytes)
break;
}
spin_unlock_irq(&nullb->lock);
- return errno_to_blk_status(err);
+ return err;
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
@@ -1949,7 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
.logical_block_size = dev->blocksize,
.physical_block_size = dev->blocksize,
.max_hw_sectors = dev->max_sectors,
- .dma_alignment = dev->blocksize - 1,
+ .dma_alignment = 1,
};
struct nullb *nullb;
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 7bb6128dbaaf..6c4c4bbe7dad 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f45989..0ada35dc0989 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev)
}
int null_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nullb *nullb = disk->private_data;
struct nullb_device *dev = nullb->dev;
@@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
blkz.capacity = zone->capacity;
null_unlock_zone(dev, zone);
- error = cb(&blkz, i, data);
+ error = disk_report_zone(disk, &blkz, i, args);
if (error)
return error;
}
@@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
{
struct nullb_device *dev = nullb->dev;
struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
- unsigned int nr_sectors = len >> SECTOR_SHIFT;
+ unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
/* Read must be below the write pointer position */
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index dc9e4a14b885..8892f218a814 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -85,10 +85,14 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
struct bio_vec bvec;
rq_for_each_segment(bvec, req, iter) {
+ dev_dbg(&dev->sbd.core, "%s:%u: %u sectors from %llu\n",
+ __func__, __LINE__, bio_sectors(iter.bio),
+ iter.bio->bi_iter.bi_sector);
if (gather)
memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
else
memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
+ offset += bvec.bv_len;
}
}
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
index f35be51d213c..77360c2a6069 100644
--- a/drivers/block/rnbd/rnbd-proto.h
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -24,7 +24,7 @@
#define RTRS_PORT 1234
/**
- * enum rnbd_msg_types - RNBD message types
+ * enum rnbd_msg_type - RNBD message types
* @RNBD_MSG_SESS_INFO: initial session info from client to server
* @RNBD_MSG_SESS_INFO_RSP: initial session info from server to client
* @RNBD_MSG_OPEN: open (map) device request
@@ -47,10 +47,11 @@ enum rnbd_msg_type {
*/
struct rnbd_msg_hdr {
__le16 type;
+ /* private: */
__le16 __padding;
};
-/**
+/*
* We allow to map RO many times and RW only once. We allow to map yet another
* time RW, if MIGRATION is provided (second RW export can be required for
* example for VM migration)
@@ -78,6 +79,7 @@ static const __maybe_unused struct {
struct rnbd_msg_sess_info {
struct rnbd_msg_hdr hdr;
u8 ver;
+ /* private: */
u8 reserved[31];
};
@@ -89,6 +91,7 @@ struct rnbd_msg_sess_info {
struct rnbd_msg_sess_info_rsp {
struct rnbd_msg_hdr hdr;
u8 ver;
+ /* private: */
u8 reserved[31];
};
@@ -97,13 +100,16 @@ struct rnbd_msg_sess_info_rsp {
* @hdr: message header
* @access_mode: the mode to open remote device, valid values see:
* enum rnbd_access_mode
- * @device_name: device path on remote side
+ * @dev_name: device path on remote side
*/
struct rnbd_msg_open {
struct rnbd_msg_hdr hdr;
u8 access_mode;
+ /* private: */
u8 resv1;
+ /* public: */
s8 dev_name[NAME_MAX];
+ /* private: */
u8 reserved[3];
};
@@ -155,6 +161,7 @@ struct rnbd_msg_open_rsp {
__le16 secure_discard;
u8 obsolete_rotational;
u8 cache_policy;
+ /* private: */
u8 reserved[10];
};
@@ -187,7 +194,7 @@ struct rnbd_msg_io {
* @RNBD_OP_DISCARD: discard sectors
* @RNBD_OP_SECURE_ERASE: securely erase sectors
* @RNBD_OP_WRITE_ZEROES: write zeroes sectors
-
+ *
* @RNBD_F_SYNC: request is sync (sync write or read)
* @RNBD_F_FUA: forced unit access
*/
diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 1ec694d7f1a6..a9d5e575a2c4 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -17,8 +17,7 @@ use kernel::{
error::Result,
pr_info,
prelude::*,
- sync::Arc,
- types::ARef,
+ sync::{aref::ARef, Arc},
};
use pin_init::PinInit;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e0c601128efa..2c715df63f23 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -155,12 +155,13 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
+union ublk_io_buf {
+ __u64 addr;
+ struct ublk_auto_buf_reg auto_reg;
+};
+
struct ublk_io {
- /* userspace buffer address from io cmd */
- union {
- __u64 addr;
- struct ublk_auto_buf_reg buf;
- };
+ union ublk_io_buf buf;
unsigned int flags;
int res;
@@ -203,15 +204,12 @@ struct ublk_queue {
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
spinlock_t cancel_lock;
struct ublk_device *dev;
- struct ublk_io ios[];
+ struct ublk_io ios[] __counted_by(q_depth);
};
struct ublk_device {
struct gendisk *ub_disk;
- char *__queues;
-
- unsigned int queue_size;
struct ublksrv_ctrl_dev_info dev_info;
struct blk_mq_tag_set tag_set;
@@ -239,6 +237,8 @@ struct ublk_device {
bool canceling;
pid_t ublksrv_tgid;
struct delayed_work exit_work;
+
+ struct ublk_queue *queues[];
};
/* header of ublk_params */
@@ -265,7 +265,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
return ub->dev_info.flags & UBLK_F_ZONED;
}
-static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_ZONED;
}
@@ -368,7 +368,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
}
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct ublk_device *ub = disk->private_data;
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
@@ -431,7 +431,7 @@ free_req:
if (!zone->len)
break;
- ret = cb(zone, i, data);
+ ret = disk_report_zone(disk, zone, i, args);
if (ret)
goto out;
@@ -499,7 +499,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
+ iod->addr = io->buf.addr;
return BLK_STS_OK;
}
@@ -781,7 +781,7 @@ static noinline void ublk_put_device(struct ublk_device *ub)
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
int qid)
{
- return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+ return dev->queues[qid];
}
static inline bool ublk_rq_has_data(const struct request *rq)
@@ -914,73 +914,6 @@ static const struct block_device_operations ub_fops = {
.report_zones = ublk_report_zones,
};
-#define UBLK_MAX_PIN_PAGES 32
-
-struct ublk_io_iter {
- struct page *pages[UBLK_MAX_PIN_PAGES];
- struct bio *bio;
- struct bvec_iter iter;
-};
-
-/* return how many pages are copied */
-static void ublk_copy_io_pages(struct ublk_io_iter *data,
- size_t total, size_t pg_off, int dir)
-{
- unsigned done = 0;
- unsigned pg_idx = 0;
-
- while (done < total) {
- struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
- (unsigned)(PAGE_SIZE - pg_off));
- void *bv_buf = bvec_kmap_local(&bv);
- void *pg_buf = kmap_local_page(data->pages[pg_idx]);
-
- if (dir == ITER_DEST)
- memcpy(pg_buf + pg_off, bv_buf, bytes);
- else
- memcpy(bv_buf, pg_buf + pg_off, bytes);
-
- kunmap_local(pg_buf);
- kunmap_local(bv_buf);
-
- /* advance page array */
- pg_off += bytes;
- if (pg_off == PAGE_SIZE) {
- pg_idx += 1;
- pg_off = 0;
- }
-
- done += bytes;
-
- /* advance bio */
- bio_advance_iter_single(data->bio, &data->iter, bytes);
- if (!data->iter.bi_size) {
- data->bio = data->bio->bi_next;
- if (data->bio == NULL)
- break;
- data->iter = data->bio->bi_iter;
- }
- }
-}
-
-static bool ublk_advance_io_iter(const struct request *req,
- struct ublk_io_iter *iter, unsigned int offset)
-{
- struct bio *bio = req->bio;
-
- for_each_bio(bio) {
- if (bio->bi_iter.bi_size > offset) {
- iter->bio = bio;
- iter->iter = bio->bi_iter;
- bio_advance_iter(iter->bio, &iter->iter, offset);
- return true;
- }
- offset -= bio->bi_iter.bi_size;
- }
- return false;
-}
-
/*
* Copy data between request pages and io_iter, and 'offset'
* is the start point of linear offset of request.
@@ -988,34 +921,35 @@ static bool ublk_advance_io_iter(const struct request *req,
static size_t ublk_copy_user_pages(const struct request *req,
unsigned offset, struct iov_iter *uiter, int dir)
{
- struct ublk_io_iter iter;
+ struct req_iterator iter;
+ struct bio_vec bv;
size_t done = 0;
- if (!ublk_advance_io_iter(req, &iter, offset))
- return 0;
-
- while (iov_iter_count(uiter) && iter.bio) {
- unsigned nr_pages;
- ssize_t len;
- size_t off;
- int i;
+ rq_for_each_segment(bv, req, iter) {
+ void *bv_buf;
+ size_t copied;
- len = iov_iter_get_pages2(uiter, iter.pages,
- iov_iter_count(uiter),
- UBLK_MAX_PIN_PAGES, &off);
- if (len <= 0)
- return done;
-
- ublk_copy_io_pages(&iter, len, off, dir);
- nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
- for (i = 0; i < nr_pages; i++) {
- if (dir == ITER_DEST)
- set_page_dirty(iter.pages[i]);
- put_page(iter.pages[i]);
+ if (offset >= bv.bv_len) {
+ offset -= bv.bv_len;
+ continue;
}
- done += len;
- }
+ bv.bv_offset += offset;
+ bv.bv_len -= offset;
+ bv_buf = bvec_kmap_local(&bv);
+ if (dir == ITER_DEST)
+ copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
+ else
+ copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
+
+ kunmap_local(bv_buf);
+
+ done += copied;
+ if (copied < bv.bv_len)
+ break;
+
+ offset = 0;
+ }
return done;
}
@@ -1030,8 +964,9 @@ static inline bool ublk_need_unmap_req(const struct request *req)
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
}
-static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
- const struct ublk_io *io)
+static unsigned int ublk_map_io(const struct ublk_queue *ubq,
+ const struct request *req,
+ const struct ublk_io *io)
{
const unsigned int rq_bytes = blk_rq_bytes(req);
@@ -1047,13 +982,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
struct iov_iter iter;
const int dir = ITER_DEST;
- import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
+ import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter);
return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
-static int ublk_unmap_io(bool need_map,
+static unsigned int ublk_unmap_io(bool need_map,
const struct request *req,
const struct ublk_io *io)
{
@@ -1068,7 +1003,7 @@ static int ublk_unmap_io(bool need_map,
WARN_ON_ONCE(io->res > rq_bytes);
- import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
+ import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter);
return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
@@ -1134,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
- iod->addr = io->addr;
+ iod->addr = io->buf.addr;
return BLK_STS_OK;
}
@@ -1233,45 +1168,65 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
}
static void
-ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
+ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag)
{
- unsigned tag = io - ubq->ios;
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
}
-static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
- struct ublk_io *io, unsigned int issue_flags)
+enum auto_buf_reg_res {
+ AUTO_BUF_REG_FAIL,
+ AUTO_BUF_REG_FALLBACK,
+ AUTO_BUF_REG_OK,
+};
+
+static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ struct io_uring_cmd *cmd,
+ enum auto_buf_reg_res res)
+{
+ if (res == AUTO_BUF_REG_OK) {
+ io->task_registered_buffers = 1;
+ io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd);
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ }
+ ublk_init_req_ref(ubq, io);
+ __ublk_prep_compl_io_cmd(io, req);
+}
+
+static enum auto_buf_reg_res
+__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
int ret;
- ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
- io->buf.index, issue_flags);
+ ret = io_buffer_register_bvec(cmd, req, ublk_io_release,
+ io->buf.auto_reg.index, issue_flags);
if (ret) {
- if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
- ublk_auto_buf_reg_fallback(ubq, io);
- return true;
+ if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(ubq, req->tag);
+ return AUTO_BUF_REG_FALLBACK;
}
blk_mq_end_request(req, BLK_STS_IOERR);
- return false;
+ return AUTO_BUF_REG_FAIL;
}
- io->task_registered_buffers = 1;
- io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
- io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
- return true;
+ return AUTO_BUF_REG_OK;
}
-static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
- struct request *req, struct ublk_io *io,
- unsigned int issue_flags)
+static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
- ublk_init_req_ref(ubq, io);
- if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
- return ublk_auto_buf_reg(ubq, req, io, issue_flags);
+ enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd,
+ issue_flags);
- return true;
+ if (res != AUTO_BUF_REG_FAIL) {
+ ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res);
+ io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
+ }
}
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
@@ -1343,8 +1298,12 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req)
if (!ublk_start_io(ubq, req, io))
return;
- if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) {
+ ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags);
+ } else {
+ ublk_init_req_ref(ubq, io);
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
+ }
}
static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
@@ -1536,7 +1495,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
*/
io->flags &= UBLK_IO_FLAG_CANCELED;
io->cmd = NULL;
- io->addr = 0;
+ io->buf.addr = 0;
/*
* old task is PF_EXITING, put it now
@@ -2097,13 +2056,16 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
{
- io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+ struct ublk_auto_buf_reg buf;
- if (io->buf.reserved0 || io->buf.reserved1)
+ buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (buf.reserved0 || buf.reserved1)
return -EINVAL;
- if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
return -EINVAL;
+ io->buf.auto_reg = buf;
return 0;
}
@@ -2125,7 +2087,7 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io,
* this ublk request gets stuck.
*/
if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
- *buf_idx = io->buf.index;
+ *buf_idx = io->buf.auto_reg.index;
}
return ublk_set_auto_buf_reg(io, cmd);
@@ -2153,7 +2115,7 @@ ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
if (ublk_dev_support_auto_buf_reg(ub))
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
- io->addr = buf_addr;
+ io->buf.addr = buf_addr;
return 0;
}
@@ -2271,39 +2233,41 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
return 0;
}
-static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
- struct ublk_io *io, __u64 buf_addr)
+static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+ struct ublk_io *io)
{
- int ret = 0;
-
- /*
- * When handling FETCH command for setting up ublk uring queue,
- * ub->mutex is the innermost lock, and we won't block for handling
- * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
- */
- mutex_lock(&ub->mutex);
/* UBLK_IO_FETCH_REQ is only allowed before dev is setup */
- if (ublk_dev_ready(ub)) {
- ret = -EBUSY;
- goto out;
- }
+ if (ublk_dev_ready(ub))
+ return -EBUSY;
/* allow each command to be FETCHed at most once */
- if (io->flags & UBLK_IO_FLAG_ACTIVE) {
- ret = -EINVAL;
- goto out;
- }
+ if (io->flags & UBLK_IO_FLAG_ACTIVE)
+ return -EINVAL;
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
ublk_fill_io_cmd(io, cmd);
- ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
- if (ret)
- goto out;
WRITE_ONCE(io->task, get_task_struct(current));
ublk_mark_io_ready(ub);
-out:
+
+ return 0;
+}
+
+static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
+ struct ublk_io *io, __u64 buf_addr)
+{
+ int ret;
+
+ /*
+ * When handling FETCH command for setting up ublk uring queue,
+ * ub->mutex is the innermost lock, and we won't block for handling
+ * FETCH, so it is fine even for IO_URING_F_NONBLOCK.
+ */
+ mutex_lock(&ub->mutex);
+ ret = __ublk_fetch(cmd, ub, io);
+ if (!ret)
+ ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
mutex_unlock(&ub->mutex);
return ret;
}
@@ -2350,7 +2314,7 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
*/
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
/* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ ublk_get_iod(ubq, req->tag)->addr = io->buf.addr;
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
__func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
@@ -2366,7 +2330,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
u16 buf_idx = UBLK_INVALID_BUF_IDX;
struct ublk_device *ub = cmd->file->private_data;
struct ublk_queue *ubq;
- struct ublk_io *io;
+ struct ublk_io *io = NULL;
u32 cmd_op = cmd->cmd_op;
u16 q_id = READ_ONCE(ub_src->q_id);
u16 tag = READ_ONCE(ub_src->tag);
@@ -2487,7 +2451,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
out:
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
- __func__, cmd_op, tag, ret, io->flags);
+ __func__, cmd_op, tag, ret, io ? io->flags : 0);
return ret;
}
@@ -2575,9 +2539,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
size_t buf_off;
u16 tag, q_id;
- if (!ub)
- return ERR_PTR(-EACCES);
-
if (!user_backed_iter(iter))
return ERR_PTR(-EACCES);
@@ -2603,9 +2564,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb,
if (!req)
return ERR_PTR(-EINVAL);
- if (!req->mq_hctx || !req->mq_hctx->driver_data)
- goto fail;
-
if (!ublk_check_ubuf_dir(req, dir))
goto fail;
@@ -2662,9 +2620,13 @@ static const struct file_operations ublk_ch_fops = {
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
{
- int size = ublk_queue_cmd_buf_size(ub);
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- int i;
+ struct ublk_queue *ubq = ub->queues[q_id];
+ int size, i;
+
+ if (!ubq)
+ return;
+
+ size = ublk_queue_cmd_buf_size(ub);
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2676,57 +2638,76 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
if (ubq->io_cmd_buf)
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+
+ kvfree(ubq);
+ ub->queues[q_id] = NULL;
+}
+
+static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id)
+{
+ unsigned int cpu;
+
+ /* Find first CPU mapped to this queue */
+ for_each_possible_cpu(cpu) {
+ if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
+ return cpu_to_node(cpu);
+ }
+
+ return NUMA_NO_NODE;
}
static int ublk_init_queue(struct ublk_device *ub, int q_id)
{
- struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+ int depth = ub->dev_info.queue_depth;
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
- void *ptr;
+ struct ublk_queue *ubq;
+ struct page *page;
+ int numa_node;
int size;
+ /* Determine NUMA node based on queue's CPU affinity */
+ numa_node = ublk_get_queue_numa_node(ub, q_id);
+
+ /* Allocate queue structure on local NUMA node */
+ ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL,
+ numa_node);
+ if (!ubq)
+ return -ENOMEM;
+
spin_lock_init(&ubq->cancel_lock);
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
- ubq->q_depth = ub->dev_info.queue_depth;
+ ubq->q_depth = depth;
size = ublk_queue_cmd_buf_size(ub);
- ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
- if (!ptr)
+ /* Allocate I/O command buffer on local NUMA node */
+ page = alloc_pages_node(numa_node, gfp_flags, get_order(size));
+ if (!page) {
+ kvfree(ubq);
return -ENOMEM;
+ }
+ ubq->io_cmd_buf = page_address(page);
- ubq->io_cmd_buf = ptr;
+ ub->queues[q_id] = ubq;
ubq->dev = ub;
return 0;
}
static void ublk_deinit_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
int i;
- if (!ub->__queues)
- return;
-
- for (i = 0; i < nr_queues; i++)
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_deinit_queue(ub, i);
- kvfree(ub->__queues);
}
static int ublk_init_queues(struct ublk_device *ub)
{
- int nr_queues = ub->dev_info.nr_hw_queues;
- int depth = ub->dev_info.queue_depth;
- int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
- int i, ret = -ENOMEM;
+ int i, ret;
- ub->queue_size = ubq_size;
- ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
- if (!ub->__queues)
- return ret;
-
- for (i = 0; i < nr_queues; i++) {
- if (ublk_init_queue(ub, i))
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ ret = ublk_init_queue(ub, i);
+ if (ret)
goto fail;
}
@@ -3128,7 +3109,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
goto out_unlock;
ret = -ENOMEM;
- ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+ ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL);
if (!ub)
goto out_unlock;
mutex_init(&ub->mutex);
@@ -3178,17 +3159,17 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ub->dev_info.nr_hw_queues, nr_cpu_ids);
ublk_align_max_io_size(ub);
- ret = ublk_init_queues(ub);
+ ret = ublk_add_tag_set(ub);
if (ret)
goto out_free_dev_number;
- ret = ublk_add_tag_set(ub);
+ ret = ublk_init_queues(ub);
if (ret)
- goto out_deinit_queues;
+ goto out_free_tag_set;
ret = -EFAULT;
if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
- goto out_free_tag_set;
+ goto out_deinit_queues;
/*
* Add the char dev so that ublksrv daemon can be setup.
@@ -3197,10 +3178,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
ret = ublk_add_chdev(ub);
goto out_unlock;
-out_free_tag_set:
- blk_mq_free_tag_set(&ub->tag_set);
out_deinit_queues:
ublk_deinit_queues(ub);
+out_free_tag_set:
+ blk_mq_free_tag_set(&ub->tag_set);
out_free_dev_number:
ublk_free_dev_number(ub);
out_free_ub:
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index f061420dfb10..357434bdae99 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -584,7 +584,8 @@ out:
static int virtblk_parse_zone(struct virtio_blk *vblk,
struct virtio_blk_zone_descriptor *entry,
- unsigned int idx, report_zones_cb cb, void *data)
+ unsigned int idx,
+ struct blk_report_zones_args *args)
{
struct blk_zone zone = { };
@@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk,
* The callback below checks the validity of the reported
* entry data, no need to further validate it here.
*/
- return cb(&zone, idx, data);
+ return disk_report_zone(vblk->disk, &zone, idx, args);
}
static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb,
- void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct virtio_blk *vblk = disk->private_data;
struct virtio_blk_zone_report *report;
@@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = virtblk_parse_zone(vblk, &report->zones[i],
- zone_idx, cb, data);
+ zone_idx, args);
if (ret)
goto fail_report;
@@ -1026,8 +1027,13 @@ static int init_vq(struct virtio_blk *vblk)
out:
kfree(vqs);
kfree(vqs_info);
- if (err)
+ if (err) {
kfree(vblk->vqs);
+ /*
+ * Set to NULL to prevent freeing vqs again during freezing.
+ */
+ vblk->vqs = NULL;
+ }
return err;
}
@@ -1598,6 +1604,12 @@ static int virtblk_freeze_priv(struct virtio_device *vdev)
vdev->config->del_vqs(vdev);
kfree(vblk->vqs);
+ /*
+ * Set to NULL to prevent freeing vqs again after a failed vqs
+ * allocation during resume. Note that kfree() already handles NULL
+ * pointers safely.
+ */
+ vblk->vqs = NULL;
return 0;
}
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a423228e201b..3f50321aa4a7 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -32,6 +32,8 @@ enum {
ZLOOP_OPT_NR_QUEUES = (1 << 6),
ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+ ZLOOP_OPT_ZONE_APPEND = (1 << 9),
+ ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
};
static const match_table_t zloop_opt_tokens = {
@@ -44,6 +46,8 @@ static const match_table_t zloop_opt_tokens = {
{ ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
{ ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" },
+ { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
{ ZLOOP_OPT_ERR, NULL }
};
@@ -56,6 +60,8 @@ static const match_table_t zloop_opt_tokens = {
#define ZLOOP_DEF_NR_QUEUES 1
#define ZLOOP_DEF_QUEUE_DEPTH 128
#define ZLOOP_DEF_BUFFERED_IO false
+#define ZLOOP_DEF_ZONE_APPEND true
+#define ZLOOP_DEF_ORDERED_ZONE_APPEND false
/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB 16384
@@ -71,6 +77,8 @@ struct zloop_options {
unsigned int nr_queues;
unsigned int queue_depth;
bool buffered_io;
+ bool zone_append;
+ bool ordered_zone_append;
};
/*
@@ -92,6 +100,7 @@ struct zloop_zone {
unsigned long flags;
struct mutex lock;
+ spinlock_t wp_lock;
enum blk_zone_cond cond;
sector_t start;
sector_t wp;
@@ -108,6 +117,8 @@ struct zloop_device {
struct workqueue_struct *workqueue;
bool buffered_io;
+ bool zone_append;
+ bool ordered_zone_append;
const char *base_dir;
struct file *data_dir;
@@ -147,6 +158,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
struct zloop_zone *zone = &zlo->zones[zone_no];
struct kstat stat;
sector_t file_sectors;
+ unsigned long flags;
int ret;
lockdep_assert_held(&zone->lock);
@@ -172,16 +184,18 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
return -EINVAL;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
if (!file_sectors) {
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
} else if (file_sectors == zlo->zone_capacity) {
zone->cond = BLK_ZONE_COND_FULL;
- zone->wp = zone->start + zlo->zone_size;
+ zone->wp = ULLONG_MAX;
} else {
zone->cond = BLK_ZONE_COND_CLOSED;
zone->wp = zone->start + file_sectors;
}
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
return 0;
}
@@ -225,6 +239,7 @@ unlock:
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -243,10 +258,12 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
break;
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
+ spin_lock_irqsave(&zone->wp_lock, flags);
if (zone->wp == zone->start)
zone->cond = BLK_ZONE_COND_EMPTY;
else
zone->cond = BLK_ZONE_COND_CLOSED;
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_FULL:
@@ -264,6 +281,7 @@ unlock:
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -281,9 +299,11 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
unlock:
mutex_unlock(&zone->lock);
@@ -308,6 +328,7 @@ static int zloop_reset_all_zones(struct zloop_device *zlo)
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
+ unsigned long flags;
int ret = 0;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
@@ -325,9 +346,11 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
+ spin_lock_irqsave(&zone->wp_lock, flags);
zone->cond = BLK_ZONE_COND_FULL;
- zone->wp = zone->start + zlo->zone_size;
+ zone->wp = ULLONG_MAX;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
unlock:
mutex_unlock(&zone->lock);
@@ -369,6 +392,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
struct zloop_zone *zone;
struct iov_iter iter;
struct bio_vec tmp;
+ unsigned long flags;
sector_t zone_end;
int nr_bvec = 0;
int ret;
@@ -378,6 +402,11 @@ static void zloop_rw(struct zloop_cmd *cmd)
cmd->nr_sectors = nr_sectors;
cmd->ret = 0;
+ if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
+ ret = -EIO;
+ goto out;
+ }
+
/* We should never get an I/O beyond the device capacity. */
if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
ret = -EIO;
@@ -406,16 +435,31 @@ static void zloop_rw(struct zloop_cmd *cmd)
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
mutex_lock(&zone->lock);
- if (is_append) {
- sector = zone->wp;
- cmd->sector = sector;
- }
+ spin_lock_irqsave(&zone->wp_lock, flags);
/*
- * Write operations must be aligned to the write pointer and
- * fully contained within the zone capacity.
+ * Zone append operations always go at the current write
+ * pointer, but regular write operations must already be
+ * aligned to the write pointer when submitted.
*/
- if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ if (is_append) {
+ /*
+ * If ordered zone append is in use, we already checked
+ * and set the target sector in zloop_queue_rq().
+ */
+ if (!zlo->ordered_zone_append) {
+ if (zone->cond == BLK_ZONE_COND_FULL ||
+ zone->wp + nr_sectors > zone_end) {
+ spin_unlock_irqrestore(&zone->wp_lock,
+ flags);
+ ret = -EIO;
+ goto unlock;
+ }
+ sector = zone->wp;
+ }
+ cmd->sector = sector;
+ } else if (sector != zone->wp) {
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
zone_no, sector, zone->wp);
ret = -EIO;
@@ -428,13 +472,19 @@ static void zloop_rw(struct zloop_cmd *cmd)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
/*
- * Advance the write pointer of sequential zones. If the write
- * fails, the wp position will be corrected when the next I/O
- * copmpletes.
+ * Advance the write pointer, unless ordered zone append is in
+ * use. If the write fails, the write pointer position will be
+ * corrected when the next I/O starts execution.
*/
- zone->wp += nr_sectors;
- if (zone->wp == zone_end)
- zone->cond = BLK_ZONE_COND_FULL;
+ if (!is_append || !zlo->ordered_zone_append) {
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = ULLONG_MAX;
+ }
+ }
+
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
}
rq_for_each_bvec(tmp, rq, rq_iter)
@@ -498,6 +548,10 @@ static void zloop_handle_cmd(struct zloop_cmd *cmd)
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct zloop_device *zlo = rq->q->queuedata;
+ /* We can block in this context, so ignore REQ_NOWAIT. */
+ if (rq->cmd_flags & REQ_NOWAIT)
+ rq->cmd_flags &= ~REQ_NOWAIT;
+
switch (req_op(rq)) {
case REQ_OP_READ:
case REQ_OP_WRITE:
@@ -608,6 +662,35 @@ static void zloop_complete_rq(struct request *rq)
blk_mq_end_request(rq, sts);
}
+static bool zloop_set_zone_append_sector(struct request *rq)
+{
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ sector_t zone_end = zone->start + zlo->zone_capacity;
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->wp_lock, flags);
+
+ if (zone->cond == BLK_ZONE_COND_FULL ||
+ zone->wp + nr_sectors > zone_end) {
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
+ return false;
+ }
+
+ rq->__sector = zone->wp;
+ zone->wp += blk_rq_sectors(rq);
+ if (zone->wp >= zone_end) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = ULLONG_MAX;
+ }
+
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
+
+ return true;
+}
+
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -618,6 +701,16 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (zlo->state == Zlo_deleting)
return BLK_STS_IOERR;
+ /*
+ * If we need to strongly order zone append operations, set the request
+ * sector to the zone write pointer location now instead of when the
+ * command work runs.
+ */
+ if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
+ if (!zloop_set_zone_append_sector(rq))
+ return BLK_STS_IOERR;
+ }
+
blk_mq_start_request(rq);
INIT_WORK(&cmd->work, zloop_cmd_workfn);
@@ -647,11 +740,12 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode)
}
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct zloop_device *zlo = disk->private_data;
struct blk_zone blkz = {};
unsigned int first, i;
+ unsigned long flags;
int ret;
first = disk_zone_no(disk, sector);
@@ -675,7 +769,9 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
blkz.start = zone->start;
blkz.len = zlo->zone_size;
+ spin_lock_irqsave(&zone->wp_lock, flags);
blkz.wp = zone->wp;
+ spin_unlock_irqrestore(&zone->wp_lock, flags);
blkz.cond = zone->cond;
if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
@@ -687,7 +783,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
mutex_unlock(&zone->lock);
- ret = cb(&blkz, i, data);
+ ret = disk_report_zone(disk, &blkz, i, args);
if (ret)
return ret;
}
@@ -783,6 +879,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
int ret;
mutex_init(&zone->lock);
+ spin_lock_init(&zone->wp_lock);
zone->start = (sector_t)zone_no << zlo->zone_shift;
if (!restore)
@@ -884,7 +981,6 @@ static int zloop_ctl_add(struct zloop_options *opts)
{
struct queue_limits lim = {
.max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
- .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
.chunk_sectors = opts->zone_size,
.features = BLK_FEAT_ZONED,
};
@@ -936,6 +1032,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->nr_zones = nr_zones;
zlo->nr_conv_zones = opts->nr_conv_zones;
zlo->buffered_io = opts->buffered_io;
+ zlo->zone_append = opts->zone_append;
+ if (zlo->zone_append)
+ zlo->ordered_zone_append = opts->ordered_zone_append;
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
opts->nr_queues * opts->queue_depth, zlo->id);
@@ -976,6 +1075,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
lim.physical_block_size = zlo->block_size;
lim.logical_block_size = zlo->block_size;
+ if (zlo->zone_append)
+ lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
zlo->tag_set.ops = &zloop_mq_ops;
zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1016,10 +1117,14 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->state = Zlo_live;
mutex_unlock(&zloop_ctl_mutex);
- pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+ pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
zlo->id, zlo->nr_zones,
((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
zlo->block_size);
+ pr_info("zloop%d: using %s%s zone append\n",
+ zlo->id,
+ zlo->ordered_zone_append ? "ordered " : "",
+ zlo->zone_append ? "native" : "emulated");
return 0;
@@ -1106,6 +1211,8 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+ opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
+ opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
if (!buf)
return 0;
@@ -1215,6 +1322,21 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
case ZLOOP_OPT_BUFFERED_IO:
opts->buffered_io = true;
break;
+ case ZLOOP_OPT_ZONE_APPEND:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token != 0 && token != 1) {
+ pr_err("Invalid zone_append value\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_append = token;
+ break;
+ case ZLOOP_OPT_ORDERED_ZONE_APPEND:
+ opts->ordered_zone_append = true;
+ break;
case ZLOOP_OPT_ERR:
default:
pr_warn("unknown parameter or missing value '%s'\n", p);
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 48ce750bf70a..7708d92df23e 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -24,21 +24,18 @@
* Since the gens and priorities are all stored contiguously on disk, we can
* batch this up: We fill up the free_inc list with freshly invalidated buckets,
* call prio_write(), and when prio_write() finishes we pull buckets off the
- * free_inc list and optionally discard them.
+ * free_inc list.
*
* free_inc isn't the only freelist - if it was, we'd often to sleep while
* priorities and gens were being written before we could allocate. c->free is a
* smaller freelist, and buckets on that list are always ready to be used.
*
- * If we've got discards enabled, that happens when a bucket moves from the
- * free_inc list to the free list.
- *
* There is another freelist, because sometimes we have buckets that we know
* have nothing pointing into them - these we can reuse without waiting for
* priorities to be rewritten. These come from freed btree nodes and buckets
* that garbage collection discovered no longer had valid keys pointing into
* them (because they were overwritten). That's the unused list - buckets on the
- * unused list move to the free list, optionally being discarded in the process.
+ * unused list move to the free list.
*
* It's also important to ensure that gens don't wrap around - with respect to
* either the oldest gen in the btree or the gen on disk. This is quite
@@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
/*
* Background allocation thread: scans for buckets to be invalidated,
* invalidates them, rewrites prios/gens (marking them as invalidated on disk),
- * then optionally issues discard commands to the newly free buckets, then puts
- * them on the various freelists.
+ * then puts them on the various freelists.
*/
static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -321,8 +317,7 @@ static int bch_allocator_thread(void *arg)
while (1) {
/*
* First, we pull buckets off of the unused and free_inc lists,
- * possibly issue discards to them, then we add the bucket to
- * the free list:
+ * then we add the bucket to the free list:
*/
while (1) {
long bucket;
@@ -330,14 +325,6 @@ static int bch_allocator_thread(void *arg)
if (!fifo_pop(&ca->free_inc, bucket))
break;
- if (ca->discard) {
- mutex_unlock(&ca->set->bucket_lock);
- blkdev_issue_discard(ca->bdev,
- bucket_to_sector(ca->set, bucket),
- ca->sb.bucket_size, GFP_KERNEL);
- mutex_lock(&ca->set->bucket_lock);
- }
-
allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->btree_cache_wait);
wake_up(&ca->set->bucket_wait);
@@ -412,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
TASK_UNINTERRUPTIBLE);
mutex_unlock(&ca->set->bucket_lock);
+
+ atomic_inc(&ca->set->bucket_wait_cnt);
schedule();
+ atomic_dec(&ca->set->bucket_wait_cnt);
+
mutex_lock(&ca->set->bucket_lock);
} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
!fifo_pop(&ca->free[reserve], r));
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1d33e40d26ea..8ccacba85547 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -447,8 +447,7 @@ struct cache {
* free_inc: Incoming buckets - these are buckets that currently have
* cached data in them, and we can't reuse them until after we write
* their new gen to disk. After prio_write() finishes writing the new
- * gens/prios, they'll be moved to the free list (and possibly discarded
- * in the process)
+ * gens/prios, they'll be moved to the free list.
*/
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
@@ -467,8 +466,6 @@ struct cache {
*/
unsigned int invalidate_needs_gc;
- bool discard; /* Get rid of? */
-
struct journal_device journal;
/* The rest of this all shows up in sysfs */
@@ -607,6 +604,7 @@ struct cache_set {
*/
atomic_t prio_blocked;
wait_queue_head_t bucket_wait;
+ atomic_t bucket_wait_cnt;
/*
* For any bio we don't skip we subtract the number of sectors from
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 011f6062c4c0..6ee2c6a506a2 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -327,9 +327,13 @@ struct btree_iter {
/* Fixed-size btree_iter that can be allocated on the stack */
struct btree_iter_stack {
- struct btree_iter iter;
- struct btree_iter_set stack_data[MAX_BSETS];
+ /* Must be last as it ends in a flexible-array member. */
+ TRAILING_OVERLAP(struct btree_iter, iter, data,
+ struct btree_iter_set stack_data[MAX_BSETS];
+ );
};
+static_assert(offsetof(struct btree_iter_stack, iter.data) ==
+ offsetof(struct btree_iter_stack, stack_data));
typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 210b59007d98..3ed39c823826 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -89,8 +89,9 @@
* Test module load/unload
*/
-#define MAX_GC_TIMES 100
-#define MIN_GC_NODES 100
+#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */
+#define GC_NODES_MIN 10
+#define GC_SLEEP_MS_MIN 10
#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -371,7 +372,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) {
struct bio_vec *bv;
void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
@@ -1578,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b)
static size_t btree_gc_min_nodes(struct cache_set *c)
{
- size_t min_nodes;
+ size_t min_nodes = GC_NODES_MIN;
- /*
- * Since incremental GC would stop 100ms when front
- * side I/O comes, so when there are many btree nodes,
- * if GC only processes constant (100) nodes each time,
- * GC would last a long time, and the front side I/Os
- * would run out of the buckets (since no new bucket
- * can be allocated during GC), and be blocked again.
- * So GC should not process constant nodes, but varied
- * nodes according to the number of btree nodes, which
- * realized by dividing GC into constant(100) times,
- * so when there are many btree nodes, GC can process
- * more nodes each time, otherwise, GC will process less
- * nodes each time (but no less than MIN_GC_NODES)
- */
- min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
- if (min_nodes < MIN_GC_NODES)
- min_nodes = MIN_GC_NODES;
+ if (atomic_read(&c->search_inflight) == 0) {
+ size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT;
+
+ if (min_nodes < n)
+ min_nodes = n;
+ }
return min_nodes;
}
+static uint64_t btree_gc_sleep_ms(struct cache_set *c)
+{
+ uint64_t sleep_ms;
+
+ if (atomic_read(&c->bucket_wait_cnt) > 0)
+ sleep_ms = GC_SLEEP_MS_MIN;
+ else
+ sleep_ms = GC_SLEEP_MS;
+
+ return sleep_ms;
+}
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
@@ -1668,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
- if (atomic_read(&b->c->search_inflight) &&
- gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN;
break;
@@ -1846,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c)
cond_resched();
if (ret == -EAGAIN)
- schedule_timeout_interruptible(msecs_to_jiffies
- (GC_SLEEP_MS));
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(btree_gc_sleep_ms(c)));
else if (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -2822,7 +2822,8 @@ void bch_btree_exit(void)
int __init bch_btree_init(void)
{
- btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ btree_io_wq = alloc_workqueue("bch_btree_io",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!btree_io_wq)
return -ENOMEM;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index d50eb82ccb4f..144693b7c46a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -275,8 +275,7 @@ bsearch:
* ja->cur_idx
*/
ja->cur_idx = i;
- ja->last_idx = ja->discard_idx = (i + 1) %
- ca->sb.njournal_buckets;
+ ja->last_idx = (i + 1) % ca->sb.njournal_buckets;
}
@@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
}
-static bool is_discard_enabled(struct cache_set *s)
-{
- struct cache *ca = s->cache;
-
- if (ca->discard)
- return true;
-
- return false;
-}
-
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
BUG_ON(i->pin && atomic_read(i->pin) != 1);
if (n != i->j.seq) {
- if (n == start && is_discard_enabled(s))
- pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- else {
- pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
- n, i->j.seq - 1, start, end);
- ret = -EIO;
- goto err;
- }
+ pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
}
for (k = i->j.start;
@@ -568,65 +552,6 @@ out:
#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1)
-static void journal_discard_endio(struct bio *bio)
-{
- struct journal_device *ja =
- container_of(bio, struct journal_device, discard_bio);
- struct cache *ca = container_of(ja, struct cache, journal);
-
- atomic_set(&ja->discard_in_flight, DISCARD_DONE);
-
- closure_wake_up(&ca->set->journal.wait);
- closure_put(&ca->set->cl);
-}
-
-static void journal_discard_work(struct work_struct *work)
-{
- struct journal_device *ja =
- container_of(work, struct journal_device, discard_work);
-
- submit_bio(&ja->discard_bio);
-}
-
-static void do_journal_discard(struct cache *ca)
-{
- struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->discard_bio;
-
- if (!ca->discard) {
- ja->discard_idx = ja->last_idx;
- return;
- }
-
- switch (atomic_read(&ja->discard_in_flight)) {
- case DISCARD_IN_FLIGHT:
- return;
-
- case DISCARD_DONE:
- ja->discard_idx = (ja->discard_idx + 1) %
- ca->sb.njournal_buckets;
-
- atomic_set(&ja->discard_in_flight, DISCARD_READY);
- fallthrough;
-
- case DISCARD_READY:
- if (ja->discard_idx == ja->last_idx)
- return;
-
- atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
-
- bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD);
- bio->bi_iter.bi_sector = bucket_to_sector(ca->set,
- ca->sb.d[ja->discard_idx]);
- bio->bi_iter.bi_size = bucket_bytes(ca);
- bio->bi_end_io = journal_discard_endio;
-
- closure_get(&ca->set->cl);
- INIT_WORK(&ja->discard_work, journal_discard_work);
- queue_work(bch_journal_wq, &ja->discard_work);
- }
-}
-
static unsigned int free_journal_buckets(struct cache_set *c)
{
struct journal *j = &c->journal;
@@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c)
unsigned int n;
/* In case njournal_buckets is not power of 2 */
- if (ja->cur_idx >= ja->discard_idx)
- n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx;
+ if (ja->cur_idx >= ja->last_idx)
+ n = ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx;
else
- n = ja->discard_idx - ja->cur_idx;
+ n = ja->last_idx - ja->cur_idx;
if (n > (1 + j->do_reserve))
return n - (1 + j->do_reserve);
@@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c)
ja->last_idx = (ja->last_idx + 1) %
ca->sb.njournal_buckets;
- do_journal_discard(ca);
-
if (c->journal.blocks_free)
goto out;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index cd316b4a1e95..9e9d1b3016a5 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -139,19 +139,6 @@ struct journal_device {
/* Last journal bucket that still contains an open journal entry */
unsigned int last_idx;
- /* Next journal bucket to be discarded */
- unsigned int discard_idx;
-
-#define DISCARD_READY 0
-#define DISCARD_IN_FLIGHT 1
-#define DISCARD_DONE 2
- /* 1 - discard in flight, -1 - discard completed */
- atomic_t discard_in_flight;
-
- struct work_struct discard_work;
- struct bio discard_bio;
- struct bio_vec discard_bv;
-
/* Bio for journal reads/writes to this device */
struct bio bio;
struct bio_vec bv[8];
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 6d250e366412..c17d4517af22 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1388,7 +1388,7 @@ static CLOSURE_CALLBACK(cached_dev_flush)
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
- continue_at(cl, cached_dev_free, system_wq);
+ continue_at(cl, cached_dev_free, system_percpu_wq);
}
static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
@@ -1400,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
__module_get(THIS_MODULE);
INIT_LIST_HEAD(&dc->list);
closure_init(&dc->disk.cl, NULL);
- set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
+ set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq);
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
INIT_WORK(&dc->detach, cached_dev_detach_finish);
sema_init(&dc->sb_write_mutex, 1);
@@ -1513,7 +1513,7 @@ static CLOSURE_CALLBACK(flash_dev_flush)
bcache_device_unlink(d);
mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
- continue_at(cl, flash_dev_free, system_wq);
+ continue_at(cl, flash_dev_free, system_percpu_wq);
}
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
@@ -1525,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
goto err_ret;
closure_init(&d->cl, NULL);
- set_closure_fn(&d->cl, flash_dev_flush, system_wq);
+ set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq);
kobject_init(&d->kobj, &bch_flash_dev_ktype);
@@ -1833,7 +1833,7 @@ static CLOSURE_CALLBACK(__cache_set_unregister)
mutex_unlock(&bch_register_lock);
- continue_at(cl, cache_set_flush, system_wq);
+ continue_at(cl, cache_set_flush, system_percpu_wq);
}
void bch_cache_set_stop(struct cache_set *c)
@@ -1863,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
- set_closure_fn(&c->cl, cache_set_free, system_wq);
+ set_closure_fn(&c->cl, cache_set_free, system_percpu_wq);
closure_init(&c->caching, &c->cl);
- set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
+ set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq);
/* Maybe create continue_at_noreturn() and use it here? */
closure_set_stopped(&c->cl);
@@ -1939,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
if (!c->uuids)
goto err;
- c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
+ c->moving_gc_wq = alloc_workqueue("bcache_gc",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!c->moving_gc_wq)
goto err;
@@ -2382,9 +2383,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
ca->bdev = file_bdev(bdev_file);
ca->sb_disk = sb_disk;
- if (bdev_max_discard_sectors(file_bdev(bdev_file)))
- ca->discard = CACHE_DISCARD(&ca->sb);
-
ret = cache_alloc(ca);
if (ret != 0) {
if (ret == -ENOMEM)
@@ -2531,7 +2529,7 @@ static void register_device_async(struct async_reg_args *args)
INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
/* 10 jiffies is enough for a delay */
- queue_delayed_work(system_wq, &args->reg_work, 10);
+ queue_delayed_work(system_percpu_wq, &args->reg_work, 10);
}
static void *alloc_holder_object(struct cache_sb *sb)
@@ -2905,24 +2903,25 @@ static int __init bcache_init(void)
if (bch_btree_init())
goto err;
- bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bcache_wq)
goto err;
/*
* Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
*
- * 1. It used `system_wq` before which also does no memory reclaim.
+ * 1. It used `system_percpu_wq` before which also does no memory reclaim.
* 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
* reduced throughput can be observed.
*
- * We still want to user our own queue to not congest the `system_wq`.
+ * We still want to user our own queue to not congest the `system_percpu_wq`.
*/
- bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0);
if (!bch_flush_wq)
goto err;
- bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
+ bch_journal_wq = alloc_workqueue("bch_journal",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!bch_journal_wq)
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 826b14cae4e5..72f38e5b6f5c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive);
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(io_disable);
-rw_attribute(discard);
rw_attribute(running);
rw_attribute(label);
rw_attribute(errors);
@@ -1036,7 +1035,6 @@ SHOW(__bch_cache)
sysfs_hprint(bucket_size, bucket_bytes(ca));
sysfs_hprint(block_size, block_bytes(ca));
sysfs_print(nbuckets, ca->sb.nbuckets);
- sysfs_print(discard, ca->discard);
sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9);
sysfs_hprint(btree_written,
atomic_long_read(&ca->btree_sectors_written) << 9);
@@ -1142,18 +1140,6 @@ STORE(__bch_cache)
if (bcache_is_reboot)
return -EBUSY;
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- if (bdev_max_discard_sectors(ca->bdev))
- ca->discard = v;
-
- if (v != CACHE_DISCARD(&ca->sb)) {
- SET_CACHE_DISCARD(&ca->sb, v);
- bcache_write_super(ca->set);
- }
- }
-
if (attr == &sysfs_cache_replacement_policy) {
v = __sysfs_match_string(cache_replacement_policies, -1, buf);
if (v < 0)
@@ -1185,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = {
&sysfs_block_size,
&sysfs_nbuckets,
&sysfs_priority_stats,
- &sysfs_discard,
&sysfs_written,
&sysfs_btree_written,
&sysfs_metadata_written,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 6ba73dc1a3df..4b237074f453 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg)
* may set BCH_ENABLE_AUTO_GC via sysfs, then when
* BCH_DO_AUTO_GC is set, garbage collection thread
* will be wake up here. After moving gc, the shrunk
- * btree and discarded free buckets SSD space may be
- * helpful for following write requests.
+ * btree may be helpful for following write requests.
*/
if (c->gc_after_writeback ==
(BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
@@ -1076,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
- WQ_MEM_RECLAIM, 0);
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!dc->writeback_write_wq)
return -ENOMEM;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 78e17dd4d01b..5a840c4ae316 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -17,33 +17,26 @@
* For internal zone reports bypassing the top BIO submission path.
*/
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
- sector_t sector, unsigned int nr_zones,
- report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct dm_report_zones_args *args)
{
- struct gendisk *disk = md->disk;
- int ret;
- struct dm_report_zones_args args = {
- .next_sector = sector,
- .orig_data = data,
- .orig_cb = cb,
- };
-
do {
struct dm_target *tgt;
+ int ret;
- tgt = dm_table_find_target(t, args.next_sector);
+ tgt = dm_table_find_target(t, args->next_sector);
if (WARN_ON_ONCE(!tgt->type->report_zones))
return -EIO;
- args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args,
- nr_zones - args.zone_idx);
+ args->tgt = tgt;
+ ret = tgt->type->report_zones(tgt, args,
+ nr_zones - args->zone_idx);
if (ret < 0)
return ret;
- } while (args.zone_idx < nr_zones &&
- args.next_sector < get_capacity(disk));
+ } while (args->zone_idx < nr_zones &&
+ args->next_sector < get_capacity(md->disk));
- return args.zone_idx;
+ return args->zone_idx;
}
/*
@@ -52,7 +45,8 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
* generally implemented by targets using dm_report_zones().
*/
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct mapped_device *md = disk->private_data;
struct dm_table *map;
@@ -76,9 +70,14 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
map = zone_revalidate_map;
}
- if (map)
- ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
- data);
+ if (map) {
+ struct dm_report_zones_args dm_args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .rep_args = args,
+ };
+ ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args);
+ }
if (put_table)
dm_put_live_table(md, srcu_idx);
@@ -113,7 +112,18 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
}
args->next_sector = zone->start + zone->len;
- return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+
+ /* If we have an internal callback, call it first. */
+ if (args->cb) {
+ int ret;
+
+ ret = args->cb(zone, args->zone_idx, args->data);
+ if (ret)
+ return ret;
+ }
+
+ return disk_report_zone(args->disk, zone, args->zone_idx++,
+ args->rep_args);
}
/*
@@ -492,10 +502,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
unsigned long *need_reset)
{
+ struct dm_report_zones_args args = {
+ .disk = md->disk,
+ .next_sector = sector,
+ .cb = dm_zone_need_reset_cb,
+ .data = need_reset,
+ };
int ret;
- ret = dm_blk_do_report_zones(md, t, sector, nr_zones,
- dm_zone_need_reset_cb, need_reset);
+ ret = dm_blk_do_report_zones(md, t, nr_zones, &args);
if (ret != nr_zones) {
DMERR("Get %s zone reset bitmap failed\n",
md->disk->disk_name);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 245f52b59215..7a795979ec72 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -109,7 +109,8 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args);
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 7033d982d377..8d7b82c4a723 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -72,9 +72,11 @@ static int linear_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
+ lim.logical_block_size = mddev->logical_block_size;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
+ lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
if (err)
return err;
diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
index 1eb434306162..9c1ade19b774 100644
--- a/drivers/md/md-llbitmap.c
+++ b/drivers/md/md-llbitmap.c
@@ -378,7 +378,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
case BitClean:
pctl->state[pos] = BitDirty;
break;
- };
+ }
}
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 41c476b40c7a..e5922a682953 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -99,7 +99,7 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
/*
* Default number of read corrections we'll attempt on an rdev
@@ -339,6 +339,7 @@ static int start_readonly;
*/
static bool create_on_open = true;
static bool legacy_async_del_gendisk = true;
+static bool check_new_feature = true;
/*
* We have a system wide 'event count' that is incremented
@@ -730,6 +731,8 @@ static void mddev_clear_bitmap_ops(struct mddev *mddev)
int mddev_init(struct mddev *mddev)
{
+ int err = 0;
+
if (!IS_ENABLED(CONFIG_MD_BITMAP))
mddev->bitmap_id = ID_BITMAP_NONE;
else
@@ -741,10 +744,23 @@ int mddev_init(struct mddev *mddev)
if (percpu_ref_init(&mddev->writes_pending, no_op,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- percpu_ref_exit(&mddev->active_io);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto exit_acitve_io;
}
+ err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ goto exit_writes_pending;
+
+ err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+ if (err)
+ goto exit_bio_set;
+
+ err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+ offsetof(struct md_io_clone, bio_clone), 0);
+ if (err)
+ goto exit_sync_set;
+
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
@@ -773,11 +789,24 @@ int mddev_init(struct mddev *mddev)
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
return 0;
+
+exit_sync_set:
+ bioset_exit(&mddev->sync_set);
+exit_bio_set:
+ bioset_exit(&mddev->bio_set);
+exit_writes_pending:
+ percpu_ref_exit(&mddev->writes_pending);
+exit_acitve_io:
+ percpu_ref_exit(&mddev->active_io);
+ return err;
}
EXPORT_SYMBOL_GPL(mddev_init);
void mddev_destroy(struct mddev *mddev)
{
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
+ bioset_exit(&mddev->io_clone_set);
percpu_ref_exit(&mddev->active_io);
percpu_ref_exit(&mddev->writes_pending);
}
@@ -941,8 +970,11 @@ void mddev_unlock(struct mddev *mddev)
* do_md_stop. dm raid only uses md_stop to stop. So dm raid
* doesn't need to check MD_DELETED when getting reconfig lock
*/
- if (test_bit(MD_DELETED, &mddev->flags))
+ if (test_bit(MD_DELETED, &mddev->flags) &&
+ !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) {
+ kobject_del(&mddev->kobj);
del_gendisk(mddev->gendisk);
+ }
}
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -1820,9 +1852,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
}
if (sb->pad0 ||
sb->pad3[0] ||
- memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
- /* Some padding is non-zero, might be a new feature */
- return -EINVAL;
+ memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) {
+ pr_warn("Some padding is non-zero on %pg, might be a new feature\n",
+ rdev->bdev);
+ if (check_new_feature)
+ return -EINVAL;
+ pr_warn("check_new_feature is disabled, data corruption possible\n");
+ }
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
@@ -1963,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
mddev->layout = le32_to_cpu(sb->layout);
mddev->raid_disks = le32_to_cpu(sb->raid_disks);
mddev->dev_sectors = le64_to_cpu(sb->size);
+ mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
mddev->events = ev1;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.space = 0;
@@ -2172,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
sb->level = cpu_to_le32(mddev->level);
sb->layout = cpu_to_le32(mddev->layout);
+ sb->logical_block_size = cpu_to_le32(mddev->logical_block_size);
if (test_bit(FailFast, &rdev->flags))
sb->devflags |= FailFast1;
else
@@ -2750,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+ pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
return;
}
@@ -5134,7 +5173,7 @@ static void stop_sync_thread(struct mddev *mddev, bool locked)
* Thread might be blocked waiting for metadata update which will now
* never happen
*/
- md_wakeup_thread_directly(mddev->sync_thread);
+ md_wakeup_thread_directly(&mddev->sync_thread);
if (work_pending(&mddev->sync_work))
flush_work(&mddev->sync_work);
@@ -5900,6 +5939,68 @@ static struct md_sysfs_entry md_serialize_policy =
__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
serialize_policy_store);
+static int mddev_set_logical_block_size(struct mddev *mddev,
+ unsigned int lbs)
+{
+ int err = 0;
+ struct queue_limits lim;
+
+ if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) {
+ pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n",
+ mdname(mddev), lbs);
+ return -EINVAL;
+ }
+
+ lim = queue_limits_start_update(mddev->gendisk->queue);
+ lim.logical_block_size = lbs;
+ pr_info("%s: logical_block_size is changed, data may be lost\n",
+ mdname(mddev));
+ err = queue_limits_commit_update(mddev->gendisk->queue, &lim);
+ if (err)
+ return err;
+
+ mddev->logical_block_size = lbs;
+ /* New lbs will be written to superblock after array is running */
+ set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+ return 0;
+}
+
+static ssize_t
+lbs_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%u\n", mddev->logical_block_size);
+}
+
+static ssize_t
+lbs_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int lbs;
+ int err = -EBUSY;
+
+ /* Only 1.x meta supports configurable LBS */
+ if (mddev->major_version == 0)
+ return -EINVAL;
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ err = kstrtouint(buf, 10, &lbs);
+ if (err < 0)
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ goto unlock;
+
+ err = mddev_set_logical_block_size(mddev, lbs);
+
+unlock:
+ mddev_unlock(mddev);
+ return err ?: len;
+}
+
+static struct md_sysfs_entry md_logical_block_size =
+__ATTR(logical_block_size, 0644, lbs_show, lbs_store);
static struct attribute *md_default_attrs[] = {
&md_level.attr,
@@ -5922,6 +6023,7 @@ static struct attribute *md_default_attrs[] = {
&md_consistency_policy.attr,
&md_fail_last_dev.attr,
&md_serialize_policy.attr,
+ &md_logical_block_size.attr,
NULL,
};
@@ -6052,6 +6154,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
return -EINVAL;
}
+ /*
+ * Before RAID adding folio support, the logical_block_size
+ * should be smaller than the page size.
+ */
+ if (lim->logical_block_size > PAGE_SIZE) {
+ pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ mddev->logical_block_size = lim->logical_block_size;
+
return 0;
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
@@ -6064,6 +6177,13 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
if (mddev_is_dm(mddev))
return 0;
+ if (queue_logical_block_size(rdev->bdev->bd_disk->queue) >
+ queue_logical_block_size(mddev->gendisk->queue)) {
+ pr_err("%s: incompatible logical_block_size, can not add\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
lim = queue_limits_start_update(mddev->gendisk->queue);
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
@@ -6384,29 +6504,9 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev);
}
- if (!bioset_initialized(&mddev->bio_set)) {
- err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (err)
- return err;
- }
- if (!bioset_initialized(&mddev->sync_set)) {
- err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (err)
- goto exit_bio_set;
- }
-
- if (!bioset_initialized(&mddev->io_clone_set)) {
- err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
- offsetof(struct md_io_clone, bio_clone), 0);
- if (err)
- goto exit_sync_set;
- }
-
pers = get_pers(mddev->level, mddev->clevel);
- if (!pers) {
- err = -EINVAL;
- goto abort;
- }
+ if (!pers)
+ return -EINVAL;
if (mddev->level != pers->head.id) {
mddev->level = pers->head.id;
mddev->new_level = pers->head.id;
@@ -6417,8 +6517,7 @@ int md_run(struct mddev *mddev)
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
put_pers(pers);
- err = -EINVAL;
- goto abort;
+ return -EINVAL;
}
if (pers->sync_request) {
@@ -6545,12 +6644,6 @@ bitmap_abort:
mddev->private = NULL;
put_pers(pers);
md_bitmap_destroy(mddev);
-abort:
- bioset_exit(&mddev->io_clone_set);
-exit_sync_set:
- bioset_exit(&mddev->sync_set);
-exit_bio_set:
- bioset_exit(&mddev->bio_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -6683,6 +6776,7 @@ static void md_clean(struct mddev *mddev)
mddev->chunk_sectors = 0;
mddev->ctime = mddev->utime = 0;
mddev->layout = 0;
+ mddev->logical_block_size = 0;
mddev->max_disks = 0;
mddev->events = 0;
mddev->can_decrease_events = 0;
@@ -6775,10 +6869,6 @@ static void __md_stop(struct mddev *mddev)
mddev->private = NULL;
put_pers(pers);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
- bioset_exit(&mddev->bio_set);
- bioset_exit(&mddev->sync_set);
- bioset_exit(&mddev->io_clone_set);
}
void md_stop(struct mddev *mddev)
@@ -6869,6 +6959,10 @@ static int do_md_stop(struct mddev *mddev, int mode)
if (!md_is_rdwr(mddev))
set_disk_ro(disk, 0);
+ if (mode == 2 && mddev->pers->sync_request &&
+ mddev->to_remove == NULL)
+ mddev->to_remove = &md_redundancy_group;
+
__md_stop_writes(mddev);
__md_stop(mddev);
@@ -8373,22 +8467,21 @@ static int md_thread(void *arg)
return 0;
}
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread)
{
struct md_thread *t;
rcu_read_lock();
- t = rcu_dereference(thread);
+ t = rcu_dereference(*thread);
if (t)
wake_up_process(t->tsk);
rcu_read_unlock();
}
-void md_wakeup_thread(struct md_thread __rcu *thread)
+void __md_wakeup_thread(struct md_thread __rcu *thread)
{
struct md_thread *t;
- rcu_read_lock();
t = rcu_dereference(thread);
if (t) {
pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
@@ -8396,9 +8489,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
if (wq_has_sleeper(&t->wqueue))
wake_up(&t->wqueue);
}
- rcu_read_unlock();
}
-EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(__md_wakeup_thread);
struct md_thread *md_register_thread(void (*run) (struct md_thread *),
struct mddev *mddev, const char *name)
@@ -9978,6 +10070,52 @@ static void unregister_sync_thread(struct mddev *mddev)
md_reap_sync_thread(mddev);
}
+static bool md_should_do_recovery(struct mddev *mddev)
+{
+ /*
+ * As long as one of the following flags is set,
+ * recovery needs to do or cleanup.
+ */
+ if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return true;
+
+ /*
+ * If no flags are set and it is in read-only status,
+ * there is nothing to do.
+ */
+ if (!md_is_rdwr(mddev))
+ return false;
+
+ /*
+ * MD_SB_CHANGE_PENDING indicates that the array is switching from clean to
+ * active, and no action is needed for now.
+ * All other MD_SB_* flags require to update the superblock.
+ */
+ if (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING))
+ return true;
+
+ /*
+ * If the array is not using external metadata and there has been no data
+ * written for some time, then the array's status needs to be set to
+ * in_sync.
+ */
+ if (mddev->external == 0 && mddev->safemode == 1)
+ return true;
+
+ /*
+ * When the system is about to restart or the process receives an signal,
+ * the array needs to be synchronized as soon as possible.
+ * Once the data synchronization is completed, need to change the array
+ * status to in_sync.
+ */
+ if (mddev->safemode == 2 && !mddev->in_sync &&
+ mddev->resync_offset == MaxSector)
+ return true;
+
+ return false;
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -10014,18 +10152,7 @@ void md_check_recovery(struct mddev *mddev)
flush_signals(current);
}
- if (!md_is_rdwr(mddev) &&
- !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
- return;
- if ( ! (
- (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
- test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
- (mddev->external == 0 && mddev->safemode == 1) ||
- (mddev->safemode == 2
- && !mddev->in_sync && mddev->resync_offset == MaxSector)
- ))
+ if (!md_should_do_recovery(mddev))
return;
if (mddev_trylock(mddev)) {
@@ -10281,7 +10408,6 @@ static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
struct mddev *mddev;
- int need_delay = 0;
spin_lock(&all_mddevs_lock);
list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
@@ -10295,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this,
mddev->safemode = 2;
mddev_unlock(mddev);
}
- need_delay = 1;
spin_lock(&all_mddevs_lock);
mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
- /*
- * certain more exotic SCSI devices are known to be
- * volatile wrt too early system reboots. While the
- * right place to handle this issue is the given
- * driver, we do want to have a safe RAID driver ...
- */
- if (need_delay)
- msleep(1000);
-
return NOTIFY_DONE;
}
@@ -10697,6 +10813,7 @@ module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
module_param(legacy_async_del_gendisk, bool, 0600);
+module_param(check_new_feature, bool, 0600);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1979c2d4fe89..6985f2829bbd 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -354,6 +354,7 @@ enum mddev_flags {
MD_HAS_MULTIPLE_PPLS,
MD_NOT_READY,
MD_BROKEN,
+ MD_DO_DELETE,
MD_DELETED,
};
@@ -432,6 +433,7 @@ struct mddev {
sector_t array_sectors; /* exported array size */
int external_size; /* size managed
* externally */
+ unsigned int logical_block_size;
__u64 events;
/* If the last 'event' was simply a clean->dirty transition, and
* we didn't write it to the spares, then it is safe and simple
@@ -882,6 +884,12 @@ struct md_io_clone {
#define THREAD_WAKEUP 0
+#define md_wakeup_thread(thread) do { \
+ rcu_read_lock(); \
+ __md_wakeup_thread(thread); \
+ rcu_read_unlock(); \
+} while (0)
+
static inline void safe_put_page(struct page *p)
{
if (p) put_page(p);
@@ -895,7 +903,7 @@ extern struct md_thread *md_register_thread(
struct mddev *mddev,
const char *name);
extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
-extern void md_wakeup_thread(struct md_thread __rcu *thread);
+extern void __md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
extern enum sync_action md_sync_action(struct mddev *mddev);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e443e478645a..985c377356eb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -68,7 +68,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
struct strip_zone *zone;
int cnt;
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- unsigned blksize = 512;
+ unsigned int blksize = 512;
+
+ if (!mddev_is_dm(mddev))
+ blksize = queue_logical_block_size(mddev->gendisk->queue);
*private_conf = ERR_PTR(-ENOMEM);
if (!conf)
@@ -84,7 +87,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
sector_div(sectors, mddev->chunk_sectors);
rdev1->sectors = sectors * mddev->chunk_sectors;
- blksize = max(blksize, queue_logical_block_size(
+ if (mddev_is_dm(mddev))
+ blksize = max(blksize, queue_logical_block_size(
rdev1->bdev->bd_disk->queue));
rdev_for_each(rdev2, mddev) {
@@ -383,6 +387,7 @@ static int raid0_set_limits(struct mddev *mddev)
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
lim.chunk_sectors = mddev->chunk_sectors;
@@ -405,6 +410,12 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
+ if (!mddev_is_dm(mddev)) {
+ ret = raid0_set_limits(mddev);
+ if (ret)
+ return ret;
+ }
+
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
@@ -413,11 +424,6 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
- if (!mddev_is_dm(mddev)) {
- ret = raid0_set_limits(mddev);
- if (ret)
- return ret;
- }
/* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 592a40233004..57d50465eed1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3213,6 +3213,7 @@ static int raid1_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.max_hw_wzeroes_unmap_sectors = 0;
+ lim.logical_block_size = mddev->logical_block_size;
lim.features |= BLK_FEAT_ATOMIC_WRITES;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
if (err)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 14dcd5142eb4..84be4cc7e873 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4000,6 +4000,7 @@ static int raid10_set_queue_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.max_hw_wzeroes_unmap_sectors = 0;
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.chunk_sectors = mddev->chunk_sectors;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index ba768ca7f422..e29e69335c69 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3104,7 +3104,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
goto out_mempool;
spin_lock_init(&log->tree_lock);
- INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+ INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT);
thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
"reclaim");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 24b32a0c95b4..e57ce3295292 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4956,7 +4956,8 @@ static void handle_stripe(struct stripe_head *sh)
goto finish;
if (s.handle_bad_blocks ||
- test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
+ (md_is_rdwr(conf->mddev) &&
+ test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
}
@@ -6768,7 +6769,8 @@ static void raid5d(struct md_thread *thread)
int batch_size, released;
unsigned int offset;
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+ if (md_is_rdwr(mddev) &&
+ test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
break;
released = release_stripe_list(conf, conf->temp_inactive_list);
@@ -7745,6 +7747,7 @@ static int raid5_set_limits(struct mddev *mddev)
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
md_init_stacking_limits(&lim);
+ lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index f35d3f71d14f..15b3d07f8ccd 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1283,6 +1283,7 @@ static const struct nvme_ctrl_ops nvme_ctrl_ops = {
.reg_read64 = apple_nvme_reg_read64,
.free_ctrl = apple_nvme_free_ctrl,
.get_address = apple_nvme_get_address,
+ .get_virt_boundary = nvme_get_virt_boundary,
};
static void apple_nvme_async_probe(void *data, async_cookie_t cookie)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f1f719351f3f..7bf228df6001 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2069,13 +2069,13 @@ static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
}
static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
- struct queue_limits *lim)
+ struct queue_limits *lim, bool is_admin)
{
lim->max_hw_sectors = ctrl->max_hw_sectors;
lim->max_segments = min_t(u32, USHRT_MAX,
min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
lim->max_integrity_segments = ctrl->max_integrity_segments;
- lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+ lim->virt_boundary_mask = ctrl->ops->get_virt_boundary(ctrl, is_admin);
lim->max_segment_size = UINT_MAX;
lim->dma_alignment = 3;
}
@@ -2177,7 +2177,7 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
int ret;
lim = queue_limits_start_update(ns->disk->queue);
- nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_set_ctrl_limits(ns->ctrl, &lim, false);
memflags = blk_mq_freeze_queue(ns->disk->queue);
ret = queue_limits_commit_update(ns->disk->queue, &lim);
@@ -2381,7 +2381,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
- nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_set_ctrl_limits(ns->ctrl, &lim, false);
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
nvme_set_chunk_sectors(ns, id, &lim);
if (!nvme_update_disk_info(ns, id, &lim))
@@ -2599,10 +2599,9 @@ static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
- return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
- data);
+ return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args);
}
#else
#define nvme_report_zones NULL
@@ -3589,7 +3588,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
lim = queue_limits_start_update(ctrl->admin_q);
- nvme_set_ctrl_limits(ctrl, &lim);
+ nvme_set_ctrl_limits(ctrl, &lim, true);
ret = queue_limits_commit_update(ctrl->admin_q, &lim);
if (ret)
goto out_free;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 1b58ee7d0dce..caf5503d0833 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -217,6 +217,12 @@ static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts)
min(opts->nr_poll_queues, num_online_cpus());
}
+static inline unsigned long nvmf_get_virt_boundary(struct nvme_ctrl *ctrl,
+ bool is_admin)
+{
+ return 0;
+}
+
int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 2c903729b0b9..873954d43b18 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3361,6 +3361,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_delete_ctrl,
.get_address = nvmf_get_address,
+ .get_virt_boundary = nvmf_get_virt_boundary,
};
static void
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e35eccacee8c..174027d1cc19 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -576,7 +576,7 @@ static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16],
#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nvme_ns_head *head = disk->private_data;
struct nvme_ns *ns;
@@ -585,7 +585,7 @@ static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (ns)
- ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
+ ret = nvme_ns_report_zones(ns, sector, nr_zones, args);
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 102fae6a231c..9a5f28c5103c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -558,6 +558,12 @@ static inline bool nvme_ns_has_pi(struct nvme_ns_head *head)
return head->pi_type && head->ms == head->pi_size;
}
+static inline unsigned long nvme_get_virt_boundary(struct nvme_ctrl *ctrl,
+ bool is_admin)
+{
+ return NVME_CTRL_PAGE_SIZE - 1;
+}
+
struct nvme_ctrl_ops {
const char *name;
struct module *module;
@@ -578,6 +584,7 @@ struct nvme_ctrl_ops {
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*print_device_info)(struct nvme_ctrl *ctrl);
bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
+ unsigned long (*get_virt_boundary)(struct nvme_ctrl *ctrl, bool is_admin);
};
/*
@@ -1108,7 +1115,7 @@ struct nvme_zone_info {
};
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones, struct blk_report_zones_args *args);
int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct nvme_zone_info *zi);
void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 72fb675a696f..e5ca8301bb8b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -260,8 +260,20 @@ enum nvme_iod_flags {
/* single segment dma mapping */
IOD_SINGLE_SEGMENT = 1U << 2,
+ /* Data payload contains p2p memory */
+ IOD_DATA_P2P = 1U << 3,
+
+ /* Metadata contains p2p memory */
+ IOD_META_P2P = 1U << 4,
+
+ /* Data payload contains MMIO memory */
+ IOD_DATA_MMIO = 1U << 5,
+
+ /* Metadata contains MMIO memory */
+ IOD_META_MMIO = 1U << 6,
+
/* Metadata using non-coalesced MPTR */
- IOD_SINGLE_META_SEGMENT = 1U << 5,
+ IOD_SINGLE_META_SEGMENT = 1U << 7,
};
struct nvme_dma_vec {
@@ -613,9 +625,22 @@ static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
- if (nvme_req(req)->flags & NVME_REQ_USERCMD)
- return SGL_FORCED;
- if (req->nr_integrity_segments > 1)
+ /*
+ * When the controller is capable of using SGL, there are
+ * several conditions that we force to use it:
+ *
+ * 1. A request containing page gaps within the controller's
+ * mask can not use the PRP format.
+ *
+ * 2. User commands use SGL because that lets the device
+ * validate the requested transfer lengths.
+ *
+ * 3. Multiple integrity segments must use SGL as that's the
+ * only way to describe such a command in NVMe.
+ */
+ if (req_phys_gap_mask(req) & (NVME_CTRL_PAGE_SIZE - 1) ||
+ nvme_req(req)->flags & NVME_REQ_USERCMD ||
+ req->nr_integrity_segments > 1)
return SGL_FORCED;
return SGL_SUPPORTED;
}
@@ -685,20 +710,20 @@ static void nvme_free_descriptors(struct request *req)
}
}
-static void nvme_free_prps(struct request *req)
+static void nvme_free_prps(struct request *req, unsigned int attrs)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
unsigned int i;
for (i = 0; i < iod->nr_dma_vecs; i++)
- dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
- iod->dma_vecs[i].len, rq_dma_dir(req));
+ dma_unmap_phys(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+ iod->dma_vecs[i].len, rq_dma_dir(req), attrs);
mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
}
static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
- struct nvme_sgl_desc *sg_list)
+ struct nvme_sgl_desc *sg_list, unsigned int attrs)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
enum dma_data_direction dir = rq_dma_dir(req);
@@ -707,22 +732,25 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
unsigned int i;
if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
- dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
+ dma_unmap_phys(dma_dev, le64_to_cpu(sge->addr), len, dir,
+ attrs);
return;
}
for (i = 0; i < len / sizeof(*sg_list); i++)
- dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
- le32_to_cpu(sg_list[i].length), dir);
+ dma_unmap_phys(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir, attrs);
}
static void nvme_unmap_metadata(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
enum dma_data_direction dir = rq_dma_dir(req);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct device *dma_dev = nvmeq->dev->dev;
struct nvme_sgl_desc *sge = iod->meta_descriptor;
+ unsigned int attrs = 0;
if (iod->flags & IOD_SINGLE_META_SEGMENT) {
dma_unmap_page(dma_dev, iod->meta_dma,
@@ -731,13 +759,20 @@ static void nvme_unmap_metadata(struct request *req)
return;
}
- if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
- iod->meta_total_len)) {
+ if (iod->flags & IOD_META_P2P)
+ map = PCI_P2PDMA_MAP_BUS_ADDR;
+ else if (iod->flags & IOD_META_MMIO) {
+ map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+ attrs |= DMA_ATTR_MMIO;
+ }
+
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+ iod->meta_total_len, map)) {
if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
- nvme_free_sgls(req, sge, &sge[1]);
+ nvme_free_sgls(req, sge, &sge[1], attrs);
else
- dma_unmap_page(dma_dev, iod->meta_dma,
- iod->meta_total_len, dir);
+ dma_unmap_phys(dma_dev, iod->meta_dma,
+ iod->meta_total_len, dir, attrs);
}
if (iod->meta_descriptor)
@@ -747,9 +782,11 @@ static void nvme_unmap_metadata(struct request *req)
static void nvme_unmap_data(struct request *req)
{
+ enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int attrs = 0;
if (iod->flags & IOD_SINGLE_SEGMENT) {
static_assert(offsetof(union nvme_data_ptr, prp1) ==
@@ -759,12 +796,20 @@ static void nvme_unmap_data(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+ if (iod->flags & IOD_DATA_P2P)
+ map = PCI_P2PDMA_MAP_BUS_ADDR;
+ else if (iod->flags & IOD_DATA_MMIO) {
+ map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE;
+ attrs |= DMA_ATTR_MMIO;
+ }
+
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+ map)) {
if (nvme_pci_cmd_use_sgl(&iod->cmd))
nvme_free_sgls(req, iod->descriptors[0],
- &iod->cmd.common.dptr.sgl);
+ &iod->cmd.common.dptr.sgl, attrs);
else
- nvme_free_prps(req);
+ nvme_free_prps(req, attrs);
}
if (iod->nr_descriptors)
@@ -1035,6 +1080,19 @@ static blk_status_t nvme_map_data(struct request *req)
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
+ switch (iter.p2pdma.map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ iod->flags |= IOD_DATA_P2P;
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ iod->flags |= IOD_DATA_MMIO;
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
+ return BLK_STS_RESOURCE;
+ }
+
if (use_sgl == SGL_FORCED ||
(use_sgl == SGL_SUPPORTED &&
(sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
@@ -1057,6 +1115,19 @@ static blk_status_t nvme_pci_setup_meta_iter(struct request *req)
&iod->meta_dma_state, &iter))
return iter.status;
+ switch (iter.p2pdma.map) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ iod->flags |= IOD_META_P2P;
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ iod->flags |= IOD_META_MMIO;
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
+ return BLK_STS_RESOURCE;
+ }
+
if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
entries = 1;
@@ -3250,6 +3321,14 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
return dma_pci_p2pdma_supported(dev->dev);
}
+static unsigned long nvme_pci_get_virt_boundary(struct nvme_ctrl *ctrl,
+ bool is_admin)
+{
+ if (!nvme_ctrl_sgl_supported(ctrl) || is_admin)
+ return NVME_CTRL_PAGE_SIZE - 1;
+ return 0;
+}
+
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
@@ -3264,6 +3343,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.get_address = nvme_pci_get_address,
.print_device_info = nvme_pci_print_device_info,
.supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
+ .get_virt_boundary = nvme_pci_get_virt_boundary,
};
static int nvme_dev_map(struct nvme_dev *dev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 190a4cfa8a5e..35c0822edb2d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2202,6 +2202,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
+ .get_virt_boundary = nvme_get_virt_boundary,
};
/*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 6795b8286c35..69cb04406b47 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2865,6 +2865,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
+ .get_virt_boundary = nvmf_get_virt_boundary,
};
static bool
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index cce4c5b55aa9..deea2dbef5b8 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -148,8 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct nvme_zone_descriptor *entry,
- unsigned int idx, report_zones_cb cb,
- void *data)
+ unsigned int idx,
+ struct blk_report_zones_args *args)
{
struct nvme_ns_head *head = ns->head;
struct blk_zone zone = { };
@@ -169,11 +169,11 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
else
zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
- return cb(&zone, idx, data);
+ return disk_report_zone(ns->disk, &zone, idx, args);
}
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones, struct blk_report_zones_args *args)
{
struct nvme_zone_report *report;
struct nvme_command c = { };
@@ -213,7 +213,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = nvme_zone_parse_entry(ns, &report->entries[i],
- zone_idx, cb, data);
+ zone_idx, args);
if (ret)
goto out_free;
zone_idx++;
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index f85a8441bcc6..fc8e7c9ad858 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -511,6 +511,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
.submit_async_event = nvme_loop_submit_async_event,
.delete_ctrl = nvme_loop_delete_ctrl_host,
.get_address = nvmf_get_address,
+ .get_virt_boundary = nvme_get_virt_boundary,
};
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 9a5c6a73511f..35031357ac4d 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -207,19 +207,6 @@ static int dasd_state_known_to_new(struct dasd_device *device)
return 0;
}
-static struct dentry *dasd_debugfs_setup(const char *name,
- struct dentry *base_dentry)
-{
- struct dentry *pde;
-
- if (!base_dentry)
- return NULL;
- pde = debugfs_create_dir(name, base_dentry);
- if (!pde || IS_ERR(pde))
- return NULL;
- return pde;
-}
-
/*
* Request the irq line for the device.
*/
@@ -234,14 +221,14 @@ static int dasd_state_known_to_basic(struct dasd_device *device)
if (rc)
return rc;
block->debugfs_dentry =
- dasd_debugfs_setup(block->gdp->disk_name,
+ debugfs_create_dir(block->gdp->disk_name,
dasd_debugfs_root_entry);
dasd_profile_init(&block->profile, block->debugfs_dentry);
if (dasd_global_profile_level == DASD_PROFILE_ON)
dasd_profile_on(&device->block->profile);
}
device->debugfs_dentry =
- dasd_debugfs_setup(dev_name(&device->cdev->dev),
+ debugfs_create_dir(dev_name(&device->cdev->dev),
dasd_debugfs_root_entry);
dasd_profile_init(&device->profile, device->debugfs_dentry);
dasd_hosts_init(device->debugfs_dentry, device);
@@ -1057,19 +1044,9 @@ static const struct file_operations dasd_stats_raw_fops = {
static void dasd_profile_init(struct dasd_profile *profile,
struct dentry *base_dentry)
{
- umode_t mode;
- struct dentry *pde;
-
- if (!base_dentry)
- return;
- profile->dentry = NULL;
profile->data = NULL;
- mode = (S_IRUSR | S_IWUSR | S_IFREG);
- pde = debugfs_create_file("statistics", mode, base_dentry,
- profile, &dasd_stats_raw_fops);
- if (pde && !IS_ERR(pde))
- profile->dentry = pde;
- return;
+ profile->dentry = debugfs_create_file("statistics", 0600, base_dentry,
+ profile, &dasd_stats_raw_fops);
}
static void dasd_profile_exit(struct dasd_profile *profile)
@@ -1089,25 +1066,9 @@ static void dasd_statistics_removeroot(void)
static void dasd_statistics_createroot(void)
{
- struct dentry *pde;
-
- dasd_debugfs_root_entry = NULL;
- pde = debugfs_create_dir("dasd", NULL);
- if (!pde || IS_ERR(pde))
- goto error;
- dasd_debugfs_root_entry = pde;
- pde = debugfs_create_dir("global", dasd_debugfs_root_entry);
- if (!pde || IS_ERR(pde))
- goto error;
- dasd_debugfs_global_entry = pde;
+ dasd_debugfs_root_entry = debugfs_create_dir("dasd", NULL);
+ dasd_debugfs_global_entry = debugfs_create_dir("global", dasd_debugfs_root_entry);
dasd_profile_init(&dasd_global_profile, dasd_debugfs_global_entry);
- return;
-
-error:
- DBF_EVENT(DBF_ERR, "%s",
- "Creation of the dasd debugfs interface failed");
- dasd_statistics_removeroot();
- return;
}
#else
@@ -1168,17 +1129,8 @@ static void dasd_hosts_exit(struct dasd_device *device)
static void dasd_hosts_init(struct dentry *base_dentry,
struct dasd_device *device)
{
- struct dentry *pde;
- umode_t mode;
-
- if (!base_dentry)
- return;
-
- mode = S_IRUSR | S_IFREG;
- pde = debugfs_create_file("host_access_list", mode, base_dentry,
- device, &dasd_hosts_fops);
- if (pde && !IS_ERR(pde))
- device->hosts_dentry = pde;
+ device->hosts_dentry = debugfs_create_file("host_access_list", 0400, base_dentry,
+ device, &dasd_hosts_fops);
}
struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int datasize,
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index ddbdf1f85d44..73972900fc55 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -355,7 +355,8 @@ static int __init dasd_parse_range(const char *range)
/* each device in dasd= parameter should be set initially online */
features |= DASD_FEATURE_INITIAL_ONLINE;
while (from <= to) {
- sprintf(bus_id, "%01x.%01x.%04x", from_id0, from_id1, from++);
+ scnprintf(bus_id, sizeof(bus_id),
+ "%01x.%01x.%04x", from_id0, from_id1, from++);
devmap = dasd_add_busid(bus_id, features);
if (IS_ERR(devmap)) {
rc = PTR_ERR(devmap);
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 687396703788..b08e900687f3 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -6139,6 +6139,7 @@ static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid
struct dasd_copy_relation *copy;
struct dasd_block *block;
struct gendisk *gdp;
+ int rc;
copy = device->copy;
if (!copy)
@@ -6173,6 +6174,13 @@ static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid
/* swap blocklayer device link */
gdp = block->gdp;
dasd_add_link_to_gendisk(gdp, secondary);
+ rc = device_move(disk_to_dev(gdp), &secondary->cdev->dev, DPM_ORDER_NONE);
+ if (rc) {
+ dev_err(&primary->cdev->dev,
+ "copy_pair_swap: moving blockdevice parent %s->%s failed (%d)\n",
+ dev_name(&primary->cdev->dev),
+ dev_name(&secondary->cdev->dev), rc);
+ }
/* re-enable device */
dasd_device_remove_stop_bits(primary, DASD_STOPPED_PPRC);
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 28e92fad0ca1..6ee3d952412e 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -22,6 +22,7 @@
static unsigned int queue_depth = 32;
static unsigned int nr_hw_queues = 4;
+static void dasd_gd_free(struct gendisk *gdp);
module_param(queue_depth, uint, 0444);
MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices");
@@ -30,6 +31,37 @@ module_param(nr_hw_queues, uint, 0444);
MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices");
/*
+ * Set device name.
+ * dasda - dasdz : 26 devices
+ * dasdaa - dasdzz : 676 devices, added up = 702
+ * dasdaaa - dasdzzz : 17576 devices, added up = 18278
+ * dasdaaaa - dasdzzzz : 456976 devices, added up = 475252
+ */
+static int dasd_name_format(char *prefix, int index, char *buf, int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+/*
* Allocate and register gendisk structure for device.
*/
int dasd_gendisk_alloc(struct dasd_block *block)
@@ -45,11 +77,13 @@ int dasd_gendisk_alloc(struct dasd_block *block)
};
struct gendisk *gdp;
struct dasd_device *base;
- int len, rc;
+ unsigned int devindex;
+ int rc;
/* Make sure the minor for this device exists. */
base = block->base;
- if (base->devindex >= DASD_PER_MAJOR)
+ devindex = base->devindex;
+ if (devindex >= DASD_PER_MAJOR)
return -EBUSY;
block->tag_set.ops = &dasd_mq_ops;
@@ -69,31 +103,17 @@ int dasd_gendisk_alloc(struct dasd_block *block)
/* Initialize gendisk structure. */
gdp->major = DASD_MAJOR;
- gdp->first_minor = base->devindex << DASD_PARTN_BITS;
+ gdp->first_minor = devindex << DASD_PARTN_BITS;
gdp->minors = 1 << DASD_PARTN_BITS;
gdp->fops = &dasd_device_operations;
- /*
- * Set device name.
- * dasda - dasdz : 26 devices
- * dasdaa - dasdzz : 676 devices, added up = 702
- * dasdaaa - dasdzzz : 17576 devices, added up = 18278
- * dasdaaaa - dasdzzzz : 456976 devices, added up = 475252
- */
- len = sprintf(gdp->disk_name, "dasd");
- if (base->devindex > 25) {
- if (base->devindex > 701) {
- if (base->devindex > 18277)
- len += sprintf(gdp->disk_name + len, "%c",
- 'a'+(((base->devindex-18278)
- /17576)%26));
- len += sprintf(gdp->disk_name + len, "%c",
- 'a'+(((base->devindex-702)/676)%26));
- }
- len += sprintf(gdp->disk_name + len, "%c",
- 'a'+(((base->devindex-26)/26)%26));
+ rc = dasd_name_format("dasd", devindex, gdp->disk_name, sizeof(gdp->disk_name));
+ if (rc) {
+ DBF_DEV_EVENT(DBF_ERR, block->base,
+ "setting disk name failed, rc %d", rc);
+ dasd_gd_free(gdp);
+ return rc;
}
- len += sprintf(gdp->disk_name + len, "%c", 'a'+(base->devindex%26));
if (base->features & DASD_FEATURE_READONLY ||
test_bit(DASD_FLAG_DEVICE_RO, &base->flags))
@@ -112,14 +132,22 @@ int dasd_gendisk_alloc(struct dasd_block *block)
}
/*
+ * Free gendisk structure
+ */
+static void dasd_gd_free(struct gendisk *gd)
+{
+ del_gendisk(gd);
+ gd->private_data = NULL;
+ put_disk(gd);
+}
+
+/*
* Unregister and free gendisk structure for device.
*/
void dasd_gendisk_free(struct dasd_block *block)
{
if (block->gdp) {
- del_gendisk(block->gdp);
- block->gdp->private_data = NULL;
- put_disk(block->gdp);
+ dasd_gd_free(block->gdp);
block->gdp = NULL;
blk_mq_free_tag_set(&block->tag_set);
}
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 36382eca941c..574af8243016 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -240,7 +240,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
struct scsi_sense_hdr *sshdr);
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data);
+ unsigned int nr_zones, struct blk_report_zones_args *args);
#else /* CONFIG_BLK_DEV_ZONED */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index a8db66428f80..56e455fb5add 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -35,8 +35,7 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
* @buf: SCSI zone descriptor.
* @idx: Index of the zone relative to the first zone reported by the current
* sd_zbc_report_zones() call.
- * @cb: Callback function pointer.
- * @data: Second argument passed to @cb.
+ * @args: report zones arguments (callback, etc)
*
* Return: Value returned by @cb.
*
@@ -44,12 +43,11 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64])
* call @cb(blk_zone, @data).
*/
static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
- unsigned int idx, report_zones_cb cb, void *data)
+ unsigned int idx, struct blk_report_zones_args *args)
{
struct scsi_device *sdp = sdkp->device;
struct blk_zone zone = { 0 };
sector_t start_lba, gran;
- int ret;
if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf)))
return -EINVAL;
@@ -87,11 +85,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64],
else
zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
- ret = cb(&zone, idx, data);
- if (ret)
- return ret;
-
- return 0;
+ return disk_report_zone(sdkp->disk, &zone, idx, args);
}
/**
@@ -217,14 +211,14 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
* @disk: Disk to report zones for.
* @sector: Start sector.
* @nr_zones: Maximum number of zones to report.
- * @cb: Callback function called to report zone information.
- * @data: Second argument passed to @cb.
+ * @args: Callback arguments.
*
* Called by the block layer to iterate over zone information. See also the
* disk->fops->report_zones() calls in block/blk-zoned.c.
*/
int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
+ unsigned int nr_zones,
+ struct blk_report_zones_args *args)
{
struct scsi_disk *sdkp = scsi_disk(disk);
sector_t lba = sectors_to_logical(sdkp->device, sector);
@@ -283,7 +277,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
}
ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx,
- cb, data);
+ args);
if (ret)
goto out;