summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorMike Christie <michael.christie@oracle.com>2025-09-17 17:12:55 -0500
committerMartin K. Petersen <martin.petersen@oracle.com>2025-11-02 22:06:12 -0500
commitbbb490053173b737604a87af03f2113fb1c279a0 (patch)
treed9432a07971601b78189efc93d807d49e5cf65d2 /drivers
parented6b97a79577db9bf9d7b21fd623f24f499e3acc (diff)
scsi: target: Move LUN stats to per-CPU
The atomic use in the main I/O path is causing perf issues when using higher performance backend devices and multiple queues (more than 10 when using vhost-scsi) like with this fio workload: [global] bs=4K iodepth=128 direct=1 ioengine=libaio group_reporting time_based runtime=120 name=standard-iops rw=randread numjobs=16 cpus_allowed=0-15 To fix this issue, move the LUN stats to per CPU. Note: I forgot to include this patch with the delayed/ordered per CPU tracking and per device/device entry per CPU stats. With this patch you get the full 33% improvements when using fast backends, multiple queues and multiple IO submiters. Signed-off-by: Mike Christie <michael.christie@oracle.com> Reviewed-by: Dmitry Bogdanov <d.bogdanov@yadro.com> Link: https://patch.msgid.link/20250917221338.14813-4-michael.christie@oracle.com Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/target/target_core_device.c1
-rw-r--r--drivers/target/target_core_fabric_configfs.c2
-rw-r--r--drivers/target/target_core_internal.h1
-rw-r--r--drivers/target/target_core_stat.c67
-rw-r--r--drivers/target/target_core_tpg.c23
-rw-r--r--drivers/target/target_core_transport.c22
6 files changed, 61 insertions, 55 deletions
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index 7bb711b24c0d..2d4a7c0c69ce 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -814,6 +814,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name)
dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
dev->dev_attrib.submit_type = TARGET_FABRIC_DEFAULT_SUBMIT;
+ /* Skip allocating lun_stats since we can't export them. */
xcopy_lun = &dev->xcopy_lun;
rcu_assign_pointer(xcopy_lun->lun_se_dev, dev);
init_completion(&xcopy_lun->lun_shutdown_comp);
diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c
index 7156a4dc1ca7..13159928e365 100644
--- a/drivers/target/target_core_fabric_configfs.c
+++ b/drivers/target/target_core_fabric_configfs.c
@@ -697,7 +697,7 @@ static void target_fabric_port_release(struct config_item *item)
struct se_lun *lun = container_of(to_config_group(item),
struct se_lun, lun_group);
- kfree_rcu(lun, rcu_head);
+ call_rcu(&lun->rcu_head, target_tpg_free_lun);
}
static struct configfs_item_operations target_fabric_port_item_ops = {
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 20aab1f50565..763e6d26e187 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -125,6 +125,7 @@ void core_tpg_add_node_to_devs(struct se_node_acl *, struct se_portal_group *,
struct se_lun *);
void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *);
struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u64);
+void target_tpg_free_lun(struct rcu_head *head);
int core_tpg_add_lun(struct se_portal_group *, struct se_lun *,
bool, struct se_device *);
void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *);
diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c
index e29d43dacaf7..083205052be2 100644
--- a/drivers/target/target_core_stat.c
+++ b/drivers/target/target_core_stat.c
@@ -606,53 +606,30 @@ static ssize_t target_stat_tgt_port_port_index_show(struct config_item *item,
return ret;
}
-static ssize_t target_stat_tgt_port_in_cmds_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%lu\n",
- atomic_long_read(&lun->lun_stats.cmd_pdus));
- rcu_read_unlock();
- return ret;
-}
-
-static ssize_t target_stat_tgt_port_write_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.rx_data_octets) >> 20));
- rcu_read_unlock();
- return ret;
+#define tgt_port_show_per_cpu_stat(prefix, field, shift) \
+per_cpu_stat_snprintf(scsi_port_stats, prefix, field, shift); \
+static ssize_t \
+target_stat_##prefix##_show(struct config_item *item, char *page) \
+{ \
+ struct se_lun *lun = to_stat_tgt_port(item); \
+ struct se_device *dev; \
+ int ret; \
+ \
+ rcu_read_lock(); \
+ dev = rcu_dereference(lun->lun_se_dev); \
+ if (!dev) { \
+ rcu_read_unlock(); \
+ return -ENODEV; \
+ } \
+ \
+ ret = per_cpu_stat_##prefix##_snprintf(lun->lun_stats, page); \
+ rcu_read_unlock(); \
+ return ret; \
}
-static ssize_t target_stat_tgt_port_read_mbytes_show(struct config_item *item,
- char *page)
-{
- struct se_lun *lun = to_stat_tgt_port(item);
- struct se_device *dev;
- ssize_t ret = -ENODEV;
-
- rcu_read_lock();
- dev = rcu_dereference(lun->lun_se_dev);
- if (dev)
- ret = snprintf(page, PAGE_SIZE, "%u\n",
- (u32)(atomic_long_read(&lun->lun_stats.tx_data_octets) >> 20));
- rcu_read_unlock();
- return ret;
-}
+tgt_port_show_per_cpu_stat(tgt_port_in_cmds, cmd_pdus, 0);
+tgt_port_show_per_cpu_stat(tgt_port_write_mbytes, rx_data_octets, 20);
+tgt_port_show_per_cpu_stat(tgt_port_read_mbytes, tx_data_octets, 20);
static ssize_t target_stat_tgt_port_hs_in_cmds_show(struct config_item *item,
char *page)
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index c0e429e5ef31..8b5ad50baa43 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -548,7 +548,7 @@ int core_tpg_register(
ret = core_tpg_add_lun(se_tpg, se_tpg->tpg_virt_lun0,
true, g_lun0_dev);
if (ret < 0) {
- kfree(se_tpg->tpg_virt_lun0);
+ target_tpg_free_lun(&se_tpg->tpg_virt_lun0->rcu_head);
return ret;
}
}
@@ -595,7 +595,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg)
if (se_tpg->proto_id >= 0) {
core_tpg_remove_lun(se_tpg, se_tpg->tpg_virt_lun0);
- kfree_rcu(se_tpg->tpg_virt_lun0, rcu_head);
+ call_rcu(&se_tpg->tpg_virt_lun0->rcu_head, target_tpg_free_lun);
}
target_tpg_deregister_rtpi(se_tpg);
@@ -615,6 +615,13 @@ struct se_lun *core_tpg_alloc_lun(
pr_err("Unable to allocate se_lun memory\n");
return ERR_PTR(-ENOMEM);
}
+
+ lun->lun_stats = alloc_percpu(struct scsi_port_stats);
+ if (!lun->lun_stats) {
+ pr_err("Unable to allocate se_lun stats memory\n");
+ goto free_lun;
+ }
+
lun->unpacked_lun = unpacked_lun;
atomic_set(&lun->lun_acl_count, 0);
init_completion(&lun->lun_shutdown_comp);
@@ -628,6 +635,18 @@ struct se_lun *core_tpg_alloc_lun(
lun->lun_tpg = tpg;
return lun;
+
+free_lun:
+ kfree(lun);
+ return ERR_PTR(-ENOMEM);
+}
+
+void target_tpg_free_lun(struct rcu_head *head)
+{
+ struct se_lun *lun = container_of(head, struct se_lun, rcu_head);
+
+ free_percpu(lun->lun_stats);
+ kfree(lun);
}
int core_tpg_add_lun(
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 0a76bdfe5528..fca9b44288bc 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1571,7 +1571,12 @@ target_cmd_parse_cdb(struct se_cmd *cmd)
return ret;
cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
- atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
+ /*
+ * If this is the xcopy_lun then we won't have lun_stats since we
+ * can't export them.
+ */
+ if (cmd->se_lun->lun_stats)
+ this_cpu_inc(cmd->se_lun->lun_stats->cmd_pdus);
return 0;
}
EXPORT_SYMBOL(target_cmd_parse_cdb);
@@ -2597,8 +2602,9 @@ queue_rsp:
!(cmd->se_cmd_flags & SCF_TREAT_READ_AS_NORMAL))
goto queue_status;
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
/*
* Perform READ_STRIP of PI using software emulation when
* backend had PI enabled, if the transport will not be
@@ -2621,14 +2627,16 @@ queue_rsp:
goto queue_full;
break;
case DMA_TO_DEVICE:
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.rx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->rx_data_octets,
+ cmd->data_length);
/*
* Check if we need to send READ payload for BIDI-COMMAND
*/
if (cmd->se_cmd_flags & SCF_BIDI) {
- atomic_long_add(cmd->data_length,
- &cmd->se_lun->lun_stats.tx_data_octets);
+ if (cmd->se_lun->lun_stats)
+ this_cpu_add(cmd->se_lun->lun_stats->tx_data_octets,
+ cmd->data_length);
ret = cmd->se_tfo->queue_data_in(cmd);
if (ret)
goto queue_full;