summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap9
-rw-r--r--Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml26
-rw-r--r--Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml1
-rw-r--r--Documentation/filesystems/iomap/operations.rst50
-rw-r--r--Documentation/filesystems/porting.rst15
-rw-r--r--Documentation/input/event-codes.rst25
-rw-r--r--Documentation/locking/seqlock.rst9
-rw-r--r--Documentation/sound/codecs/cs35l56.rst9
-rw-r--r--Documentation/userspace-api/netlink/intro-specs.rst4
-rw-r--r--Documentation/wmi/driver-development-guide.rst1
-rw-r--r--MAINTAINERS89
-rw-r--r--Makefile5
-rw-r--r--arch/alpha/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts14
-rw-r--r--arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts4
-rw-r--r--arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts4
-rw-r--r--arch/arm/boot/dts/nxp/imx/imx6ul.dtsi2
-rw-r--r--arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts2
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts10
-rw-r--r--arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi2
-rw-r--r--arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi4
-rw-r--r--arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi5
-rw-r--r--arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts24
-rw-r--r--arch/arm64/boot/dts/freescale/imx8qm-mek.dts4
-rw-r--r--arch/arm64/boot/dts/freescale/imx95.dtsi3
-rw-r--r--arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3328.dtsi1
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso10
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi6
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3576.dtsi14
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi4
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588j.dtsi2
-rw-r--r--arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts4
-rw-r--r--arch/arm64/configs/defconfig2
-rw-r--r--arch/arm64/include/asm/alternative.h7
-rw-r--r--arch/arm64/include/asm/kfence.h3
-rw-r--r--arch/arm64/include/asm/page.h4
-rw-r--r--arch/arm64/include/asm/percpu.h15
-rw-r--r--arch/arm64/include/asm/scs.h2
-rw-r--r--arch/arm64/include/asm/spectre.h1
-rw-r--r--arch/arm64/kernel/acpi.c18
-rw-r--r--arch/arm64/kernel/alternative.c19
-rw-r--r--arch/arm64/kernel/cpufeature.c6
-rw-r--r--arch/arm64/kernel/module.c21
-rw-r--r--arch/arm64/kernel/mte.c3
-rw-r--r--arch/arm64/kernel/pi/map_kernel.c2
-rw-r--r--arch/arm64/kernel/pi/patch-scs.c10
-rw-r--r--arch/arm64/kernel/pi/pi.h2
-rw-r--r--arch/arm64/kernel/probes/kprobes.c5
-rw-r--r--arch/arm64/kernel/proton-pack.c35
-rw-r--r--arch/arm64/kernel/vdso32/Makefile3
-rw-r--r--arch/arm64/kvm/arm.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c28
-rw-r--r--arch/arm64/kvm/sys_regs.c75
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c16
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c16
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c18
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c3
-rw-r--r--arch/arm64/kvm/vgic/vgic.c23
-rw-r--r--arch/arm64/mm/fault.c21
-rw-r--r--arch/arm64/mm/mmu.c111
-rw-r--r--arch/arm64/tools/syscall_32.tbl1
-rw-r--r--arch/loongarch/include/asm/cpu-features.h2
-rw-r--r--arch/loongarch/include/asm/cpu.h27
-rw-r--r--arch/loongarch/include/asm/hw_breakpoint.h4
-rw-r--r--arch/loongarch/include/asm/io.h5
-rw-r--r--arch/loongarch/include/asm/loongarch.h2
-rw-r--r--arch/loongarch/include/asm/pgalloc.h2
-rw-r--r--arch/loongarch/include/asm/pgtable.h11
-rw-r--r--arch/loongarch/include/uapi/asm/ptrace.h40
-rw-r--r--arch/loongarch/kernel/cpu-probe.c38
-rw-r--r--arch/loongarch/kernel/kexec_efi.c2
-rw-r--r--arch/loongarch/kernel/kexec_elf.c2
-rw-r--r--arch/loongarch/kernel/machine_kexec.c24
-rw-r--r--arch/loongarch/kernel/machine_kexec_file.c2
-rw-r--r--arch/loongarch/kernel/mem.c7
-rw-r--r--arch/loongarch/kernel/numa.c83
-rw-r--r--arch/loongarch/kernel/perf_event.c7
-rw-r--r--arch/loongarch/kernel/proc.c2
-rw-r--r--arch/loongarch/kernel/setup.c5
-rw-r--r--arch/loongarch/kernel/traps.c4
-rw-r--r--arch/loongarch/kvm/intc/eiointc.c2
-rw-r--r--arch/loongarch/kvm/mmu.c2
-rw-r--r--arch/loongarch/kvm/timer.c2
-rw-r--r--arch/loongarch/kvm/vcpu.c19
-rw-r--r--arch/loongarch/mm/init.c2
-rw-r--r--arch/loongarch/mm/ioremap.c2
-rw-r--r--arch/loongarch/net/bpf_jit.c3
-rw-r--r--arch/loongarch/pci/pci.c8
-rw-r--r--arch/loongarch/vdso/Makefile2
-rw-r--r--arch/m68k/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/microblaze/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/mips/boot/dts/econet/en751221.dtsi2
-rw-r--r--arch/mips/kernel/process.c2
-rw-r--r--arch/mips/kernel/syscalls/syscall_n32.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_n64.tbl1
-rw-r--r--arch/mips/kernel/syscalls/syscall_o32.tbl1
-rw-r--r--arch/mips/mm/tlb-r4k.c116
-rw-r--r--arch/mips/mti-malta/malta-init.c20
-rw-r--r--arch/parisc/boot/compressed/Makefile2
-rw-r--r--arch/parisc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/boot/Makefile3
-rw-r--r--arch/powerpc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/powerpc/platforms/cell/spufs/inode.c42
-rw-r--r--arch/powerpc/platforms/pseries/papr-hvpipe.c39
-rw-r--r--arch/powerpc/platforms/pseries/papr-platform-dump.c30
-rw-r--r--arch/powerpc/platforms/pseries/papr-rtas-common.c27
-rw-r--r--arch/riscv/Kconfig2
-rw-r--r--arch/riscv/Makefile17
-rw-r--r--arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi2
-rw-r--r--arch/riscv/include/asm/vendorid_list.h2
-rw-r--r--arch/riscv/kernel/sbi.c6
-rw-r--r--arch/riscv/kvm/aia_imsic.c16
-rw-r--r--arch/riscv/kvm/mmu.c25
-rw-r--r--arch/riscv/kvm/vcpu.c2
-rw-r--r--arch/s390/Makefile3
-rw-r--r--arch/s390/include/asm/pgtable.h12
-rw-r--r--arch/s390/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/s390/mm/pgtable.c4
-rw-r--r--arch/s390/purgatory/Makefile3
-rw-r--r--arch/sh/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/sparc/kernel/syscalls/syscall.tbl1
-rw-r--r--arch/x86/Makefile4
-rw-r--r--arch/x86/boot/compressed/Makefile7
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/events/core.c10
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/include/asm/ftrace.h5
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kernel/acpi/cppc.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c7
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/ftrace_64.S8
-rw-r--r--arch/x86/kvm/svm/avic.c24
-rw-r--r--arch/x86/kvm/svm/nested.c20
-rw-r--r--arch/x86/kvm/svm/svm.c95
-rw-r--r--arch/x86/kvm/svm/svm.h5
-rw-r--r--arch/x86/kvm/vmx/common.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c8
-rw-r--r--arch/x86/kvm/x86.c50
-rw-r--r--arch/xtensa/kernel/syscalls/syscall.tbl1
-rw-r--r--block/bdev.c25
-rw-r--r--block/fops.c5
-rw-r--r--drivers/acpi/acpi_mrrm.c43
-rw-r--r--drivers/acpi/apei/einj-core.c64
-rw-r--r--drivers/acpi/arm64/gtdt.c4
-rw-r--r--drivers/acpi/cppc_acpi.c6
-rw-r--r--drivers/acpi/numa/hmat.c46
-rw-r--r--drivers/acpi/numa/srat.c2
-rw-r--r--drivers/acpi/processor_driver.c6
-rw-r--r--drivers/acpi/processor_idle.c115
-rw-r--r--drivers/ata/libata-core.c10
-rw-r--r--drivers/ata/libata-scsi.c11
-rw-r--r--drivers/atm/fore200e.c2
-rw-r--r--drivers/base/devtmpfs.c6
-rw-r--r--drivers/base/firmware_loader/main.c59
-rw-r--r--drivers/base/power/main.c25
-rw-r--r--drivers/block/nbd.c54
-rw-r--r--drivers/bluetooth/btrtl.c24
-rw-r--r--drivers/bluetooth/btusb.c52
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c4
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun55i-a523.c2
-rw-r--r--drivers/counter/microchip-tcb-capture.c2
-rw-r--r--drivers/cpufreq/intel_pstate.c9
-rw-r--r--drivers/crypto/ccp/sev-dev.c17
-rw-r--r--drivers/crypto/hisilicon/qm.c2
-rw-r--r--drivers/cxl/core/region.c2
-rw-r--r--drivers/dax/super.c2
-rw-r--r--drivers/dma-buf/dma-buf.c10
-rw-r--r--drivers/edac/altera_edac.c22
-rw-r--r--drivers/edac/versalnet_edac.c24
-rw-r--r--drivers/firewire/core-card.c2
-rw-r--r--drivers/firewire/core-topology.c3
-rw-r--r--drivers/firmware/efi/libstub/Makefile4
-rw-r--r--drivers/firmware/stratix10-svc.c7
-rw-r--r--drivers/gpio/gpiolib-cdev.c75
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c12
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_queue.c12
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c153
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h6
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c8
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c59
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc_stream.c11
-rw-r--r--drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c10
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/link_dpms.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c11
-rw-r--r--drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c7
-rw-r--r--drivers/gpu/drm/amd/display/modules/freesync/freesync.c11
-rw-r--r--drivers/gpu/drm/bridge/sii902x.c20
-rw-r--r--drivers/gpu/drm/clients/drm_client_setup.c4
-rw-r--r--drivers/gpu/drm/drm_fb_helper.c14
-rw-r--r--drivers/gpu/drm/drm_plane.c4
-rw-r--r--drivers/gpu/drm/i915/display/intel_cx0_phy.c14
-rw-r--r--drivers/gpu/drm/i915/display/intel_display.c8
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_device.c13
-rw-r--r--drivers/gpu/drm/i915/display/intel_display_device.h4
-rw-r--r--drivers/gpu/drm/i915/display/intel_dmc.c10
-rw-r--r--drivers/gpu/drm/i915/display/intel_psr.c13
-rw-r--r--drivers/gpu/drm/imagination/pvr_device.h8
-rw-r--r--drivers/gpu/drm/nouveau/nvkm/falcon/fw.c2
-rw-r--r--drivers/gpu/drm/panthor/panthor_gem.c18
-rw-r--r--drivers/gpu/drm/radeon/radeon_fence.c7
-rw-r--r--drivers/gpu/drm/sti/sti_vtg.c7
-rw-r--r--drivers/gpu/drm/tegra/dc.c1
-rw-r--r--drivers/gpu/drm/tegra/dsi.c9
-rw-r--r--drivers/gpu/drm/tegra/uapi.c7
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c16
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h1
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c5
-rw-r--r--drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c12
-rw-r--r--drivers/gpu/drm/xe/Kconfig1
-rw-r--r--drivers/gpu/drm/xe/regs/xe_gt_regs.h1
-rw-r--r--drivers/gpu/drm/xe/tests/xe_mocs.c2
-rw-r--r--drivers/gpu/drm/xe/xe_gt_clock.c7
-rw-r--r--drivers/gpu/drm/xe/xe_guc_ct.c15
-rw-r--r--drivers/gpu/drm/xe/xe_irq.c18
-rw-r--r--drivers/gpu/drm/xe/xe_pci.c1
-rw-r--r--drivers/gpu/drm/xe/xe_vm.c6
-rw-r--r--drivers/gpu/drm/xe/xe_wa.c11
-rw-r--r--drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c2
-rw-r--r--drivers/hid/hid-apple.c1
-rw-r--r--drivers/hid/hid-corsair-void.c5
-rw-r--r--drivers/hid/hid-elecom.c6
-rw-r--r--drivers/hid/hid-haptic.c2
-rw-r--r--drivers/hid/hid-ids.h8
-rw-r--r--drivers/hid/hid-input.c5
-rw-r--r--drivers/hid/hid-lenovo.c17
-rw-r--r--drivers/hid/hid-ntrig.c7
-rw-r--r--drivers/hid/hid-playstation.c2
-rw-r--r--drivers/hid/hid-quirks.c16
-rw-r--r--drivers/hid/hid-uclogic-params.c4
-rw-r--r--drivers/hid/usbhid/hid-pidff.c4
-rw-r--r--drivers/hv/mshv_root_main.c30
-rw-r--r--drivers/hwmon/gpd-fan.c54
-rw-r--r--drivers/iio/accel/adxl355_core.c44
-rw-r--r--drivers/iio/accel/bmc150-accel-core.c5
-rw-r--r--drivers/iio/accel/bmc150-accel.h1
-rw-r--r--drivers/iio/adc/ad4030.c2
-rw-r--r--drivers/iio/adc/ad7124.c12
-rw-r--r--drivers/iio/adc/ad7280a.c2
-rw-r--r--drivers/iio/adc/ad7380.c8
-rw-r--r--drivers/iio/adc/rtq6056.c2
-rw-r--r--drivers/iio/adc/stm32-dfsdm-adc.c5
-rw-r--r--drivers/iio/buffer/industrialio-buffer-dma.c6
-rw-r--r--drivers/iio/buffer/industrialio-buffer-dmaengine.c2
-rw-r--r--drivers/iio/common/ssp_sensors/ssp_dev.c4
-rw-r--r--drivers/iio/humidity/hdc3020.c73
-rw-r--r--drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h40
-rw-r--r--drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c19
-rw-r--r--drivers/iio/industrialio-buffer.c21
-rw-r--r--drivers/iio/pressure/bmp280-core.c15
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c11
-rw-r--r--drivers/input/keyboard/cros_ec_keyb.c6
-rw-r--r--drivers/input/keyboard/imx_sc_key.c2
-rw-r--r--drivers/input/tablet/pegasus_notetaker.c9
-rw-r--r--drivers/input/touchscreen/goodix.c28
-rw-r--r--drivers/input/touchscreen/goodix.h1
-rw-r--r--drivers/iommu/dma-iommu.c5
-rw-r--r--drivers/iommu/iommufd/driver.c2
-rw-r--r--drivers/iommu/iommufd/iommufd_private.h4
-rw-r--r--drivers/irqchip/irq-riscv-intc.c3
-rw-r--r--drivers/mailbox/mailbox-test.c2
-rw-r--r--drivers/mailbox/mailbox-th1520.c4
-rw-r--r--drivers/mailbox/mtk-cmdq-mailbox.c45
-rw-r--r--drivers/mailbox/mtk-gpueb-mailbox.c2
-rw-r--r--drivers/mailbox/omap-mailbox.c35
-rw-r--r--drivers/mailbox/pcc.c8
-rw-r--r--drivers/md/dm-pcache/Makefile2
-rw-r--r--drivers/md/dm-pcache/cache.c4
-rw-r--r--drivers/md/dm-pcache/cache.h2
-rw-r--r--drivers/md/dm-pcache/cache_req.c6
-rw-r--r--drivers/md/dm-pcache/pcache_internal.h2
-rw-r--r--drivers/md/dm-verity-fec.c6
-rw-r--r--drivers/md/dm.c2
-rw-r--r--drivers/media/mc/mc-request.c34
-rw-r--r--drivers/memory/tegra/tegra210.c4
-rw-r--r--drivers/misc/mei/pci-me.c13
-rw-r--r--drivers/misc/mei/pci-txe.c13
-rw-r--r--drivers/misc/mei/platform-vsc.c11
-rw-r--r--drivers/misc/ntsync.c21
-rw-r--r--drivers/mmc/host/Kconfig2
-rw-r--r--drivers/mmc/host/dw_mmc-rockchip.c4
-rw-r--r--drivers/mmc/host/pxamci.c56
-rw-r--r--drivers/mmc/host/sdhci-of-dwcmshc.c31
-rw-r--r--drivers/most/most_usb.c14
-rw-r--r--drivers/mtd/mtdchar.c6
-rw-r--r--drivers/mtd/nand/Kconfig2
-rw-r--r--drivers/mtd/nand/ecc-realtek.c6
-rw-r--r--drivers/mtd/nand/onenand/onenand_samsung.c2
-rw-r--r--drivers/mtd/nand/raw/cadence-nand-controller.c3
-rw-r--r--drivers/mtd/nand/spi/fmsh.c2
-rw-r--r--drivers/net/bonding/bond_main.c5
-rw-r--r--drivers/net/can/rcar/rcar_canfd.c53
-rw-r--r--drivers/net/can/sja1000/sja1000.c4
-rw-r--r--drivers/net/can/sun4i_can.c4
-rw-r--r--drivers/net/can/usb/gs_usb.c100
-rw-r--r--drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c4
-rw-r--r--drivers/net/dsa/hirschmann/hellcreek_ptp.c14
-rw-r--r--drivers/net/dsa/microchip/ksz_common.c31
-rw-r--r--drivers/net/dsa/microchip/ksz_ptp.c22
-rw-r--r--drivers/net/dsa/microchip/lan937x_main.c1
-rw-r--r--drivers/net/dsa/sja1105/sja1105_main.c7
-rw-r--r--drivers/net/ethernet/airoha/airoha_ppe.c2
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c22
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h1
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/aq_ring.c5
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c19
-rw-r--r--drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c2
-rw-r--r--drivers/net/ethernet/emulex/benet/be_main.c7
-rw-r--r--drivers/net/ethernet/freescale/fec.h1
-rw-r--r--drivers/net/ethernet/freescale/fec_main.c2
-rw-r--r--drivers/net/ethernet/freescale/fec_ptp.c64
-rw-r--r--drivers/net/ethernet/intel/ice/ice_ptp.c22
-rw-r--r--drivers/net/ethernet/intel/idpf/idpf_main.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/cq.c23
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/devlink.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c3
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c33
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/en_main.c1
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c15
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c6
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c7
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c28
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core_linecards.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c6
-rw-r--r--drivers/net/ethernet/meta/fbnic/fbnic_fw.c2
-rw-r--r--drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c5
-rw-r--r--drivers/net/ethernet/qlogic/qede/qede_fp.c5
-rw-r--r--drivers/net/ethernet/realtek/r8169_main.c19
-rw-r--r--drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c4
-rw-r--r--drivers/net/ethernet/ti/am65-cpsw-qos.c51
-rw-r--r--drivers/net/ethernet/toshiba/ps3_gelic_net.c45
-rw-r--r--drivers/net/ethernet/toshiba/ps3_gelic_net.h1
-rw-r--r--drivers/net/phy/mdio_bus.c5
-rw-r--r--drivers/net/phy/micrel.c12
-rw-r--r--drivers/net/phy/mxl-gpy.c20
-rw-r--r--drivers/net/phy/phylink.c3
-rw-r--r--drivers/net/team/team_core.c23
-rw-r--r--drivers/net/tun_vnet.h2
-rw-r--r--drivers/net/veth.c43
-rw-r--r--drivers/net/virtio_net.c19
-rw-r--r--drivers/net/wireless/ath/ath11k/wmi.c3
-rw-r--r--drivers/net/wireless/intel/iwlwifi/mld/link.c7
-rw-r--r--drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c13
-rw-r--r--drivers/net/wireless/intel/iwlwifi/mvm/time-event.c14
-rw-r--r--drivers/net/wireless/intel/iwlwifi/mvm/utils.c12
-rw-r--r--drivers/net/wireless/marvell/mwl8k.c71
-rw-r--r--drivers/net/wireless/realtek/rtw89/fw.c7
-rw-r--r--drivers/net/wireless/virtual/mac80211_hwsim.c14
-rw-r--r--drivers/net/wwan/mhi_wwan_mbim.c2
-rw-r--r--drivers/nvme/host/core.c3
-rw-r--r--drivers/nvme/host/fc.c15
-rw-r--r--drivers/nvme/host/multipath.c2
-rw-r--r--drivers/nvme/target/auth.c4
-rw-r--r--drivers/nvme/target/fabrics-cmd-auth.c1
-rw-r--r--drivers/nvme/target/nvmet.h1
-rw-r--r--drivers/nvmem/layouts.c2
-rw-r--r--drivers/pci/pci.h2
-rw-r--r--drivers/pci/pcie/aspm.c25
-rw-r--r--drivers/pci/probe.c7
-rw-r--r--drivers/pci/quirks.c42
-rw-r--r--drivers/perf/riscv_pmu_sbi.c2
-rw-r--r--drivers/pinctrl/cirrus/pinctrl-cs42l43.c23
-rw-r--r--drivers/pinctrl/mediatek/pinctrl-mt8189.c4
-rw-r--r--drivers/pinctrl/mediatek/pinctrl-mt8196.c6
-rw-r--r--drivers/pinctrl/nxp/pinctrl-s32cc.c3
-rw-r--r--drivers/pinctrl/qcom/pinctrl-msm.c2
-rw-r--r--drivers/pinctrl/realtek/Kconfig1
-rw-r--r--drivers/platform/arm64/lenovo-thinkpad-t14s.c16
-rw-r--r--drivers/platform/x86/Kconfig1
-rw-r--r--drivers/platform/x86/acer-wmi.c4
-rw-r--r--drivers/platform/x86/amd/pmc/pmc-quirks.c25
-rw-r--r--drivers/platform/x86/amd/pmc/pmc.c3
-rw-r--r--drivers/platform/x86/amd/pmc/pmc.h1
-rw-r--r--drivers/platform/x86/dell/alienware-wmi-wmax.c106
-rw-r--r--drivers/platform/x86/hp/hp-wmi.c6
-rw-r--r--drivers/platform/x86/huawei-wmi.c4
-rw-r--r--drivers/platform/x86/intel/hid.c1
-rw-r--r--drivers/platform/x86/intel/punit_ipc.c2
-rw-r--r--drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c4
-rw-r--r--drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h9
-rw-r--r--drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c4
-rw-r--r--drivers/platform/x86/msi-wmi-platform.c43
-rw-r--r--drivers/pmdomain/arm/scmi_pm_domain.c13
-rw-r--r--drivers/pmdomain/imx/gpc.c2
-rw-r--r--drivers/pmdomain/mediatek/mtk-pm-domains.c18
-rw-r--r--drivers/pmdomain/samsung/exynos-pm-domains.c29
-rw-r--r--drivers/pmdomain/tegra/powergate-bpmp.c1
-rw-r--r--drivers/power/supply/intel_dc_ti_battery.c10
-rw-r--r--drivers/pwm/pwm-adp5585.c4
-rw-r--r--drivers/regulator/fixed.c1
-rw-r--r--drivers/regulator/rtq2208-regulator.c6
-rw-r--r--drivers/reset/reset-imx8mp-audiomix.c4
-rw-r--r--drivers/s390/net/ctcm_mpc.c1
-rw-r--r--drivers/scsi/sg.c10
-rw-r--r--drivers/slimbus/qcom-ngd-ctrl.c1
-rw-r--r--drivers/spi/Kconfig4
-rw-r--r--drivers/spi/spi-amlogic-spifc-a1.c4
-rw-r--r--drivers/spi/spi-bcm63xx.c14
-rw-r--r--drivers/spi/spi-cadence-quadspi.c18
-rw-r--r--drivers/spi/spi-fsl-lpspi.c8
-rw-r--r--drivers/spi/spi-imx.c15
-rw-r--r--drivers/spi/spi-nxp-fspi.c10
-rw-r--r--drivers/spi/spi-xilinx.c2
-rw-r--r--drivers/spi/spi.c12
-rw-r--r--drivers/target/loopback/tcm_loop.c3
-rw-r--r--drivers/target/target_core_configfs.c14
-rw-r--r--drivers/tee/qcomtee/call.c2
-rw-r--r--drivers/tee/qcomtee/core.c2
-rw-r--r--drivers/thunderbolt/nhi.c2
-rw-r--r--drivers/thunderbolt/nhi.h1
-rw-r--r--drivers/tty/pty.c51
-rw-r--r--drivers/tty/serial/8250/8250.h4
-rw-r--r--drivers/tty/serial/8250/8250_platform.c2
-rw-r--r--drivers/tty/serial/8250/8250_rsa.c26
-rw-r--r--drivers/tty/serial/8250/Makefile2
-rw-r--r--drivers/tty/serial/amba-pl011.c2
-rw-r--r--drivers/usb/cdns3/cdns3-pci-wrap.c5
-rw-r--r--drivers/usb/dwc3/core.c3
-rw-r--r--drivers/usb/dwc3/dwc3-pci.c80
-rw-r--r--drivers/usb/dwc3/ep0.c1
-rw-r--r--drivers/usb/dwc3/gadget.c7
-rw-r--r--drivers/usb/gadget/function/f_eem.c7
-rw-r--r--drivers/usb/gadget/udc/core.c17
-rw-r--r--drivers/usb/gadget/udc/renesas_usbf.c4
-rw-r--r--drivers/usb/host/xhci-dbgcap.h1
-rw-r--r--drivers/usb/host/xhci-dbgtty.c23
-rw-r--r--drivers/usb/host/xhci-ring.c15
-rw-r--r--drivers/usb/host/xhci-sideband.c102
-rw-r--r--drivers/usb/host/xhci.c1
-rw-r--r--drivers/usb/renesas_usbhs/common.c14
-rw-r--r--drivers/usb/serial/ftdi_sio.c1
-rw-r--r--drivers/usb/serial/ftdi_sio_ids.h1
-rw-r--r--drivers/usb/serial/option.c10
-rw-r--r--drivers/usb/storage/sddr55.c6
-rw-r--r--drivers/usb/storage/transport.c16
-rw-r--r--drivers/usb/storage/uas.c5
-rw-r--r--drivers/usb/storage/unusual_devs.h2
-rw-r--r--drivers/usb/typec/ucsi/psy.c5
-rw-r--r--drivers/vdpa/mlx5/net/mlx5_vnet.c6
-rw-r--r--drivers/vfio/group.c28
-rw-r--r--drivers/vhost/net.c53
-rw-r--r--drivers/vhost/vhost.c76
-rw-r--r--drivers/vhost/vhost.h10
-rw-r--r--drivers/video/fbdev/core/fbcon.c9
-rw-r--r--fs/9p/acl.c1
-rw-r--r--fs/9p/vfs_file.c17
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/9p/vfs_inode_dotl.c2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/affs/inode.c2
-rw-r--r--fs/afs/cell.c121
-rw-r--r--fs/afs/dir.c4
-rw-r--r--fs/afs/dynroot.c9
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h13
-rw-r--r--fs/afs/mntpt.c3
-rw-r--r--fs/afs/proc.c3
-rw-r--r--fs/afs/security.c49
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/vl_alias.c3
-rw-r--r--fs/aio.c6
-rw-r--r--fs/anon_inodes.c23
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs/autofs_i.h5
-rw-r--r--fs/autofs/dev-ioctl.c31
-rw-r--r--fs/autofs/inode.c1
-rw-r--r--fs/autofs/root.c8
-rw-r--r--fs/backing-file.c153
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/inode.c21
-rw-r--r--fs/binfmt_misc.c11
-rw-r--r--fs/btrfs/block-group.c10
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/defrag.c14
-rw-r--r--fs/btrfs/extent_io.c21
-rw-r--r--fs/btrfs/file.c9
-rw-r--r--fs/btrfs/inode.c88
-rw-r--r--fs/btrfs/ioctl.c41
-rw-r--r--fs/btrfs/misc.h5
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/subpage.c5
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c20
-rw-r--r--fs/btrfs/zoned.c60
-rw-r--r--fs/buffer.c6
-rw-r--r--fs/cachefiles/interface.c11
-rw-r--r--fs/cachefiles/namei.c98
-rw-r--r--fs/cachefiles/volume.c9
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/crypto.c4
-rw-r--r--fs/ceph/file.c4
-rw-r--r--fs/ceph/inode.c28
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/coda/cnode.c4
-rw-r--r--fs/coredump.c142
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/crypto/keyring.c2
-rw-r--r--fs/crypto/keysetup.c2
-rw-r--r--fs/dax.c30
-rw-r--r--fs/dcache.c35
-rw-r--r--fs/debugfs/inode.c74
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/Kconfig2
-rw-r--r--fs/ecryptfs/crypto.c90
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h13
-rw-r--r--fs/ecryptfs/inode.c169
-rw-r--r--fs/ecryptfs/keystore.c65
-rw-r--r--fs/ecryptfs/main.c7
-rw-r--r--fs/ecryptfs/super.c5
-rw-r--r--fs/efivarfs/super.c1
-rw-r--r--fs/efs/inode.c2
-rw-r--r--fs/erofs/data.c5
-rw-r--r--fs/erofs/decompressor_zstd.c11
-rw-r--r--fs/erofs/fileio.c6
-rw-r--r--fs/erofs/inode.c2
-rw-r--r--fs/eventfd.c31
-rw-r--r--fs/eventpoll.c32
-rw-r--r--fs/exec.c3
-rw-r--r--fs/exfat/super.c5
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext4/inode.c28
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/orphan.c4
-rw-r--r--fs/f2fs/acl.c1
-rw-r--r--fs/f2fs/compress.c2
-rw-r--r--fs/f2fs/data.c7
-rw-r--r--fs/f2fs/inode.c2
-rw-r--r--fs/f2fs/namei.c4
-rw-r--r--fs/f2fs/super.c2
-rw-r--r--fs/fat/inode.c7
-rw-r--r--fs/fcntl.c13
-rw-r--r--fs/fhandle.c30
-rw-r--r--fs/file.c54
-rw-r--r--fs/file_attr.c4
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fs-writeback.c187
-rw-r--r--fs/fs_dirent.c (renamed from fs/fs_types.c)2
-rw-r--r--fs/fs_struct.c6
-rw-r--r--fs/fuse/dir.c22
-rw-r--r--fs/fuse/file.c286
-rw-r--r--fs/fuse/fuse_i.h8
-rw-r--r--fs/fuse/inode.c17
-rw-r--r--fs/fuse/virtio_fs.c2
-rw-r--r--fs/gfs2/aops.c14
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/glops.c2
-rw-r--r--fs/gfs2/inode.c4
-rw-r--r--fs/gfs2/ops_fstype.c2
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/options.c1
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c31
-rw-r--r--fs/hpfs/dir.c2
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/super.c1
-rw-r--r--fs/init.c6
-rw-r--r--fs/inode.c317
-rw-r--r--fs/internal.h3
-rw-r--r--fs/iomap/Makefile3
-rw-r--r--fs/iomap/bio.c88
-rw-r--r--fs/iomap/buffered-io.c646
-rw-r--r--fs/iomap/direct-io.c268
-rw-r--r--fs/iomap/internal.h12
-rw-r--r--fs/iomap/ioend.c2
-rw-r--r--fs/iomap/iter.c20
-rw-r--r--fs/iomap/seek.c8
-rw-r--r--fs/iomap/trace.h7
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/jffs2/fs.c4
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_incore.h6
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/kernfs/inode.c2
-rw-r--r--fs/libfs.c43
-rw-r--r--fs/locks.c103
-rw-r--r--fs/minix/inode.c18
-rw-r--r--fs/minix/minix.h9
-rw-r--r--fs/minix/namei.c39
-rw-r--r--fs/mount.h3
-rw-r--r--fs/namei.c1059
-rw-r--r--fs/namespace.c206
-rw-r--r--fs/netfs/buffered_write.c2
-rw-r--r--fs/netfs/misc.c10
-rw-r--r--fs/netfs/read_single.c6
-rw-r--r--fs/nfs/client.c8
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/localio.c273
-rw-r--r--fs/nfs/nfs3client.c14
-rw-r--r--fs/nfs/nfs4client.c14
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4idmap.c7
-rw-r--r--fs/nfs/nfs4proc.c9
-rw-r--r--fs/nfs/pnfs.c2
-rw-r--r--fs/nfs/pnfs_nfs.c66
-rw-r--r--fs/nfs/sysfs.c1
-rw-r--r--fs/nfsd/filecache.c57
-rw-r--r--fs/nfsd/filecache.h2
-rw-r--r--fs/nfsd/nfs3proc.c16
-rw-r--r--fs/nfsd/nfs4proc.c36
-rw-r--r--fs/nfsd/nfs4recover.c40
-rw-r--r--fs/nfsd/nfs4state.c171
-rw-r--r--fs/nfsd/nfs4xdr.c5
-rw-r--r--fs/nfsd/nfsd.h1
-rw-r--r--fs/nfsd/nfsfh.c6
-rw-r--r--fs/nfsd/nfsproc.c14
-rw-r--r--fs/nfsd/state.h5
-rw-r--r--fs/nfsd/vfs.c175
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr4.h3
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/ifile.c2
-rw-r--r--fs/nilfs2/inode.c10
-rw-r--r--fs/nilfs2/nilfs.h1
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c60
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/nsfs.c148
-rw-r--r--fs/ntfs3/inode.c2
-rw-r--r--fs/ntfs3/super.c1
-rw-r--r--fs/ocfs2/acl.c1
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/inode.c27
-rw-r--r--fs/ocfs2/inode.h1
-rw-r--r--fs/ocfs2/journal.c11
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/omfs/inode.c3
-rw-r--r--fs/open.c44
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/inode.c6
-rw-r--r--fs/orangefs/orangefs-utils.c6
-rw-r--r--fs/overlayfs/copy_up.c143
-rw-r--r--fs/overlayfs/dir.c587
-rw-r--r--fs/overlayfs/file.c97
-rw-r--r--fs/overlayfs/inode.c124
-rw-r--r--fs/overlayfs/namei.c402
-rw-r--r--fs/overlayfs/overlayfs.h63
-rw-r--r--fs/overlayfs/readdir.c110
-rw-r--r--fs/overlayfs/super.c138
-rw-r--r--fs/overlayfs/util.c43
-rw-r--r--fs/overlayfs/xattrs.c35
-rw-r--r--fs/pidfs.c189
-rw-r--r--fs/pipe.c2
-rw-r--r--fs/posix_acl.c8
-rw-r--r--fs/proc/array.c9
-rw-r--r--fs/proc/base.c13
-rw-r--r--fs/proc/generic.c12
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/signalfd.c29
-rw-r--r--fs/smb/client/cached_dir.c41
-rw-r--r--fs/smb/client/cifs_spnego.c6
-rw-r--r--fs/smb/client/cifsfs.c5
-rw-r--r--fs/smb/client/cifssmb.c22
-rw-r--r--fs/smb/client/connect.c1
-rw-r--r--fs/smb/client/file.c1
-rw-r--r--fs/smb/client/fs_context.c8
-rw-r--r--fs/smb/client/inode.c15
-rw-r--r--fs/smb/client/smb1ops.c1
-rw-r--r--fs/smb/client/smbdirect.c3
-rw-r--r--fs/smb/client/transport.c2
-rw-r--r--fs/smb/server/smb2pdu.c6
-rw-r--r--fs/smb/server/transport_rdma.c14
-rw-r--r--fs/smb/server/transport_tcp.c5
-rw-r--r--fs/smb/server/vfs.c123
-rw-r--r--fs/smb/server/vfs.h8
-rw-r--r--fs/splice.c2
-rw-r--r--fs/squashfs/inode.c2
-rw-r--r--fs/super.c14
-rw-r--r--fs/sync.c19
-rw-r--r--fs/timerfd.c29
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/userfaultfd.c30
-rw-r--r--fs/utimes.c5
-rw-r--r--fs/xattr.c12
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h6
-rw-r--r--fs/xfs/scrub/common.c2
-rw-r--r--fs/xfs/scrub/inode_repair.c2
-rw-r--r--fs/xfs/scrub/orphanage.c13
-rw-r--r--fs/xfs/scrub/parent.c2
-rw-r--r--fs/xfs/scrub/symlink_repair.c2
-rw-r--r--fs/xfs/scrub/xfarray.c2
-rw-r--r--fs/xfs/xfs_aops.c7
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_file.c50
-rw-r--r--fs/xfs/xfs_handle.c56
-rw-r--r--fs/xfs/xfs_health.c4
-rw-r--r--fs/xfs/xfs_icache.c6
-rw-r--r--fs/xfs/xfs_inode.c6
-rw-r--r--fs/xfs/xfs_inode_item.c4
-rw-r--r--fs/xfs/xfs_ioctl.c6
-rw-r--r--fs/xfs/xfs_iomap.c38
-rw-r--r--fs/xfs/xfs_iops.c2
-rw-r--r--fs/xfs/xfs_reflink.h2
-rw-r--r--fs/xfs/xfs_super.c5
-rw-r--r--fs/xfs/xfs_zone_alloc.c28
-rw-r--r--fs/zonefs/file.c5
-rw-r--r--fs/zonefs/super.c4
-rw-r--r--include/acpi/processor.h34
-rw-r--r--include/asm-generic/vmlinux.lds.h3
-rw-r--r--include/drm/intel/pciids.h5
-rw-r--r--include/linux/ata.h1
-rw-r--r--include/linux/atomic/atomic-instrumented.h26
-rw-r--r--include/linux/backing-dev-defs.h2
-rw-r--r--include/linux/backing-dev.h5
-rw-r--r--include/linux/ceph/libceph.h3
-rw-r--r--include/linux/cleanup.h22
-rw-r--r--include/linux/cred.h22
-rw-r--r--include/linux/dma-mapping.h2
-rw-r--r--include/linux/entry-virt.h2
-rw-r--r--include/linux/ethtool.h2
-rw-r--r--include/linux/file.h126
-rw-r--r--include/linux/filelock.h98
-rw-r--r--include/linux/filter.h20
-rw-r--r--include/linux/fs.h727
-rw-r--r--include/linux/fs/super.h238
-rw-r--r--include/linux/fs/super_types.h336
-rw-r--r--include/linux/fs_dirent.h (renamed from include/linux/fs_types.h)11
-rw-r--r--include/linux/fs_struct.h6
-rw-r--r--include/linux/ftrace.h10
-rw-r--r--include/linux/gfp.h3
-rw-r--r--include/linux/highmem.h6
-rw-r--r--include/linux/huge_mm.h55
-rw-r--r--include/linux/iio/buffer-dma.h1
-rw-r--r--include/linux/iio/buffer_impl.h2
-rw-r--r--include/linux/init_task.h1
-rw-r--r--include/linux/iomap.h86
-rw-r--r--include/linux/local_lock.h4
-rw-r--r--include/linux/local_lock_internal.h62
-rw-r--r--include/linux/mailbox/mtk-cmdq-mailbox.h10
-rw-r--r--include/linux/map_benchmark.h1
-rw-r--r--include/linux/mlx5/cq.h1
-rw-r--r--include/linux/mm.h21
-rw-r--r--include/linux/mutex.h45
-rw-r--r--include/linux/namei.h83
-rw-r--r--include/linux/ns/ns_common_types.h196
-rw-r--r--include/linux/ns/nstree_types.h55
-rw-r--r--include/linux/ns_common.h233
-rw-r--r--include/linux/nsfs.h3
-rw-r--r--include/linux/nsproxy.h9
-rw-r--r--include/linux/nstree.h52
-rw-r--r--include/linux/pagemap.h18
-rw-r--r--include/linux/pci.h2
-rw-r--r--include/linux/pid_namespace.h3
-rw-r--r--include/linux/pipe_fs_i.h23
-rw-r--r--include/linux/pseudo_fs.h1
-rw-r--r--include/linux/sched/coredump.h2
-rw-r--r--include/linux/seqlock.h114
-rw-r--r--include/linux/shmem_fs.h2
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/linux/types.h1
-rw-r--r--include/linux/usb/gadget.h5
-rw-r--r--include/linux/user_namespace.h4
-rw-r--r--include/linux/virtio_net.h7
-rw-r--r--include/linux/writeback.h15
-rw-r--r--include/linux/xattr.h4
-rw-r--r--include/net/bluetooth/hci.h5
-rw-r--r--include/net/bluetooth/hci_core.h21
-rw-r--r--include/net/pkt_cls.h2
-rw-r--r--include/net/xfrm.h3
-rw-r--r--include/trace/events/writeback.h8
-rw-r--r--include/uapi/asm-generic/posix_types.h1
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/fcntl.h16
-rw-r--r--include/uapi/linux/input-event-codes.h2
-rw-r--r--include/uapi/linux/io_uring/query.h3
-rw-r--r--include/uapi/linux/isst_if.h50
-rw-r--r--include/uapi/linux/mount.h2
-rw-r--r--include/uapi/linux/nsfs.h58
-rw-r--r--include/uapi/linux/pidfd.h11
-rw-r--r--include/uapi/linux/tee.h23
-rw-r--r--init/do_mounts.c3
-rw-r--r--init/do_mounts_rd.c3
-rw-r--r--init/init_task.c27
-rw-r--r--init/version-timestamp.c7
-rw-r--r--io_uring/cmd_net.c2
-rw-r--r--io_uring/io_uring.c2
-rw-r--r--io_uring/mock_file.c43
-rw-r--r--io_uring/net.c6
-rw-r--r--io_uring/query.c2
-rw-r--r--io_uring/rsrc.c16
-rw-r--r--io_uring/rw.c19
-rw-r--r--ipc/mqueue.c83
-rw-r--r--ipc/msgutil.c7
-rw-r--r--ipc/namespace.c3
-rw-r--r--kernel/Kconfig.kexec9
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c29
-rw-r--r--kernel/bpf/bpf_iter.c29
-rw-r--r--kernel/bpf/helpers.c26
-rw-r--r--kernel/bpf/stream.c3
-rw-r--r--kernel/bpf/token.c47
-rw-r--r--kernel/bpf/trampoline.c5
-rw-r--r--kernel/bpf/verifier.c18
-rw-r--r--kernel/cgroup/cgroup.c21
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/cred.c33
-rw-r--r--kernel/dma/direct.c1
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/kexec_handover.c95
-rw-r--r--kernel/kexec_handover_debug.c25
-rw-r--r--kernel/kexec_handover_internal.h20
-rw-r--r--kernel/locking/mutex-debug.c10
-rw-r--r--kernel/locking/mutex.c28
-rw-r--r--kernel/locking/mutex.h5
-rw-r--r--kernel/locking/rtmutex_api.c19
-rw-r--r--kernel/locking/spinlock_debug.c4
-rw-r--r--kernel/nscommon.c246
-rw-r--r--kernel/nsproxy.c57
-rw-r--r--kernel/nstree.c782
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/power/hibernate.c9
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/power/swap.c22
-rw-r--r--kernel/sched/cputime.c20
-rw-r--r--kernel/sched/ext.c31
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/time/posix-timers.c12
-rw-r--r--kernel/time/tick-sched.c11
-rw-r--r--kernel/time/timekeeping.c25
-rw-r--r--kernel/time/timer.c7
-rw-r--r--kernel/trace/ftrace.c60
-rw-r--r--kernel/trace/trace.c10
-rw-r--r--kernel/trace/trace_events_user.c22
-rw-r--r--kernel/user.c7
-rw-r--r--kernel/watch_queue.c4
-rw-r--r--lib/Kconfig.debug3
-rw-r--r--lib/crypto/tests/sha256_kunit.c1
-rw-r--r--lib/maple_tree.c30
-rw-r--r--lib/test_kho.c3
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/damon/stat.c9
-rw-r--r--mm/damon/sysfs.c10
-rw-r--r--mm/fadvise.c3
-rw-r--r--mm/filemap.c204
-rw-r--r--mm/huge_memory.c63
-rw-r--r--mm/kmsan/core.c3
-rw-r--r--mm/kmsan/hooks.c6
-rw-r--r--mm/kmsan/shadow.c2
-rw-r--r--mm/ksm.c113
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memfd.c56
-rw-r--r--mm/memory.c20
-rw-r--r--mm/mempool.c32
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap_lock.c1
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page-writeback.c6
-rw-r--r--mm/page_alloc.c9
-rw-r--r--mm/secretmem.c22
-rw-r--r--mm/shmem.c32
-rw-r--r--mm/slub.c14
-rw-r--r--mm/swap_state.c13
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/truncate.c47
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/workingset.c2
-rw-r--r--net/atm/common.c2
-rw-r--r--net/bluetooth/6lowpan.c105
-rw-r--r--net/bluetooth/hci_conn.c33
-rw-r--r--net/bluetooth/hci_core.c89
-rw-r--r--net/bluetooth/hci_event.c56
-rw-r--r--net/bluetooth/hci_sock.c2
-rw-r--r--net/bluetooth/hci_sync.c2
-rw-r--r--net/bluetooth/iso.c30
-rw-r--r--net/bluetooth/l2cap_core.c24
-rw-r--r--net/bluetooth/mgmt.c1
-rw-r--r--net/bluetooth/sco.c35
-rw-r--r--net/bluetooth/smp.c31
-rw-r--r--net/ceph/auth_x.c2
-rw-r--r--net/ceph/ceph_common.c58
-rw-r--r--net/ceph/debugfs.c14
-rw-r--r--net/ceph/messenger_v2.c11
-rw-r--r--net/ceph/osdmap.c18
-rw-r--r--net/core/dev_ioctl.c3
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/netpoll.c7
-rw-r--r--net/devlink/rate.c4
-rw-r--r--net/dns_resolver/dns_query.c6
-rw-r--r--net/dsa/tag_brcm.c6
-rw-r--r--net/handshake/netlink.c38
-rw-r--r--net/handshake/tlshd.c1
-rw-r--r--net/hsr/hsr_device.c5
-rw-r--r--net/hsr/hsr_forward.c22
-rw-r--r--net/ipv4/esp4_offload.c6
-rw-r--r--net/ipv4/route.c5
-rw-r--r--net/ipv6/esp6_offload.c6
-rw-r--r--net/kcm/kcmsock.c22
-rw-r--r--net/l2tp/l2tp_core.c6
-rw-r--r--net/mac80211/iface.c14
-rw-r--r--net/mac80211/rx.c10
-rw-r--r--net/mctp/route.c1
-rw-r--r--net/mptcp/options.c54
-rw-r--r--net/mptcp/pm.c20
-rw-r--r--net/mptcp/pm_kernel.c2
-rw-r--r--net/mptcp/protocol.c103
-rw-r--r--net/mptcp/protocol.h3
-rw-r--r--net/mptcp/subflow.c8
-rw-r--r--net/openvswitch/actions.c68
-rw-r--r--net/openvswitch/flow_netlink.c64
-rw-r--r--net/openvswitch/flow_netlink.h2
-rw-r--r--net/sched/act_bpf.c6
-rw-r--r--net/sched/act_connmark.c12
-rw-r--r--net/sched/act_ife.c12
-rw-r--r--net/sched/cls_bpf.c6
-rw-r--r--net/sched/em_canid.c3
-rw-r--r--net/sched/em_cmp.c5
-rw-r--r--net/sched/em_nbyte.c2
-rw-r--r--net/sched/em_text.c11
-rw-r--r--net/sched/sch_api.c5
-rw-r--r--net/sched/sch_generic.c17
-rw-r--r--net/sctp/transport.c13
-rw-r--r--net/smc/smc_clc.c1
-rw-r--r--net/socket.c34
-rw-r--r--net/strparser/strparser.c2
-rw-r--r--net/sunrpc/Kconfig3
-rw-r--r--net/tipc/net.c2
-rw-r--r--net/unix/af_unix.c38
-rw-r--r--net/unix/garbage.c14
-rw-r--r--net/vmw_vsock/af_vsock.c40
-rw-r--r--net/xdp/xsk.c143
-rw-r--r--net/xfrm/xfrm_device.c2
-rw-r--r--net/xfrm/xfrm_output.c8
-rw-r--r--net/xfrm/xfrm_state.c30
-rw-r--r--net/xfrm/xfrm_user.c8
-rw-r--r--rust/Makefile2
-rw-r--r--rust/kernel/debugfs/traits.rs55
-rw-r--r--rust/kernel/sync/atomic.rs12
-rw-r--r--rust/kernel/sync/lock.rs41
-rw-r--r--rust/kernel/sync/lock/global.rs5
-rw-r--r--samples/rust/rust_debugfs.rs12
-rw-r--r--samples/rust/rust_debugfs_scoped.rs6
-rw-r--r--samples/vfs/test-statx.c6
-rw-r--r--samples/watch_queue/watch_test.c6
-rw-r--r--scripts/Makefile.extrawarn4
-rwxr-xr-xscripts/atomic/gen-atomic-instrumented.sh11
-rwxr-xr-xscripts/decode_stacktrace.sh14
-rw-r--r--scripts/gendwarfksyms/gendwarfksyms.c3
-rw-r--r--scripts/gendwarfksyms/gendwarfksyms.h2
-rw-r--r--scripts/gendwarfksyms/symbols.c4
-rw-r--r--scripts/syscall.tbl1
-rw-r--r--security/apparmor/apparmorfs.c8
-rw-r--r--security/keys/process_keys.c2
-rw-r--r--security/landlock/fs.c9
-rw-r--r--security/selinux/hooks.c251
-rw-r--r--security/selinux/include/objsec.h22
-rw-r--r--security/selinux/selinuxfs.c15
-rw-r--r--sound/hda/codecs/cirrus/cs420x.c1
-rw-r--r--sound/hda/codecs/hdmi/nvhdmi-mcp.c4
-rw-r--r--sound/hda/codecs/realtek/alc269.c11
-rw-r--r--sound/pci/au88x0/au88x0.c8
-rw-r--r--sound/soc/codecs/cs4271.c10
-rw-r--r--sound/soc/codecs/da7213.c69
-rw-r--r--sound/soc/codecs/da7213.h1
-rw-r--r--sound/soc/codecs/lpass-va-macro.c2
-rw-r--r--sound/soc/codecs/tas2781-i2c.c9
-rw-r--r--sound/soc/codecs/tas2783-sdw.c20
-rw-r--r--sound/soc/renesas/rcar/ssiu.c3
-rw-r--r--sound/soc/sdca/sdca_functions.c3
-rw-r--r--sound/soc/sdw_utils/soc_sdw_utils.c20
-rw-r--r--sound/usb/endpoint.c5
-rw-r--r--sound/usb/mixer.c4
-rw-r--r--sound/usb/quirks.c11
-rw-r--r--tools/arch/riscv/include/asm/csr.h5
-rw-r--r--tools/arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--tools/bpf/bpftool/Documentation/bpftool-prog.rst2
-rw-r--r--tools/build/feature/Makefile4
-rw-r--r--tools/include/uapi/linux/nsfs.h70
-rw-r--r--tools/lib/bpf/bpf_helpers.h28
-rwxr-xr-xtools/net/ynl/pyynl/ynl_gen_c.py12
-rw-r--r--tools/perf/Makefile.config5
-rw-r--r--tools/perf/builtin-lock.c2
-rwxr-xr-xtools/perf/tests/shell/lock_contention.sh14
-rw-r--r--tools/perf/util/header.c10
-rw-r--r--tools/perf/util/libbfd.c38
-rw-r--r--tools/perf/util/mutex.c14
-rw-r--r--tools/perf/util/mutex.h2
-rw-r--r--tools/testing/selftests/bpf/config3
-rw-r--r--tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c107
-rw-r--r--tools/testing/selftests/bpf/prog_tests/mptcp.c140
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c150
-rw-r--r--tools/testing/selftests/bpf/progs/iters_looping.c53
-rw-r--r--tools/testing/selftests/bpf/progs/livepatch_trampoline.c30
-rw-r--r--tools/testing/selftests/bpf/progs/mptcp_sockmap.c43
-rw-r--r--tools/testing/selftests/bpf/progs/stacktrace_ips.c49
-rw-r--r--tools/testing/selftests/bpf/progs/stream_fail.c6
-rw-r--r--tools/testing/selftests/bpf/progs/task_work.c6
-rw-r--r--tools/testing/selftests/bpf/progs/task_work_fail.c8
-rw-r--r--tools/testing/selftests/bpf/progs/task_work_stress.c4
-rw-r--r--tools/testing/selftests/bpf/test_kmods/bpf_testmod.c26
-rw-r--r--tools/testing/selftests/coredump/.gitignore4
-rw-r--r--tools/testing/selftests/coredump/Makefile8
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_protocol_test.c1568
-rw-r--r--tools/testing/selftests/coredump/coredump_socket_test.c742
-rw-r--r--tools/testing/selftests/coredump/coredump_test.h59
-rw-r--r--tools/testing/selftests/coredump/coredump_test_helpers.c383
-rw-r--r--tools/testing/selftests/coredump/stackdump_test.c1662
-rw-r--r--tools/testing/selftests/drivers/net/Makefile1
-rw-r--r--tools/testing/selftests/drivers/net/bonding/Makefile2
-rw-r--r--tools/testing/selftests/drivers/net/bonding/config4
-rwxr-xr-xtools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh361
-rw-r--r--tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh78
-rwxr-xr-xtools/testing/selftests/drivers/net/netcons_torture.sh130
-rw-r--r--tools/testing/selftests/filesystems/utils.c2
-rw-r--r--tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc4
-rw-r--r--tools/testing/selftests/kvm/arm64/get-reg-list.c3
-rw-r--r--tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c9
-rw-r--r--tools/testing/selftests/mm/uffd-unit-tests.c15
-rw-r--r--tools/testing/selftests/namespaces/.gitignore9
-rw-r--r--tools/testing/selftests/namespaces/Makefile24
-rw-r--r--tools/testing/selftests/namespaces/cred_change_test.c814
-rw-r--r--tools/testing/selftests/namespaces/listns_efault_test.c530
-rw-r--r--tools/testing/selftests/namespaces/listns_pagination_bug.c138
-rw-r--r--tools/testing/selftests/namespaces/listns_permissions_test.c759
-rw-r--r--tools/testing/selftests/namespaces/listns_test.c679
-rw-r--r--tools/testing/selftests/namespaces/ns_active_ref_test.c2672
-rw-r--r--tools/testing/selftests/namespaces/nsid_test.c107
-rw-r--r--tools/testing/selftests/namespaces/regression_pidfd_setns_test.c113
-rw-r--r--tools/testing/selftests/namespaces/siocgskns_test.c1824
-rw-r--r--tools/testing/selftests/namespaces/stress_test.c626
-rw-r--r--tools/testing/selftests/namespaces/wrappers.h35
-rw-r--r--tools/testing/selftests/net/.gitignore1
-rw-r--r--tools/testing/selftests/net/af_unix/Makefile1
-rw-r--r--tools/testing/selftests/net/af_unix/so_peek_off.c162
-rwxr-xr-xtools/testing/selftests/net/forwarding/lib_sh_test.sh7
-rwxr-xr-xtools/testing/selftests/net/forwarding/local_termination.sh2
-rw-r--r--tools/testing/selftests/net/lib.sh2
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_connect.c18
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh2
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh99
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_lib.sh21
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h15
-rw-r--r--tools/testing/selftests/pidfd/pidfd_info_test.c73
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json44
-rw-r--r--tools/testing/selftests/user_events/perf_test.c2
-rw-r--r--tools/testing/selftests/vfio/lib/include/vfio_util.h19
-rw-r--r--tools/testing/selftests/vfio/lib/vfio_pci_device.c246
-rw-r--r--tools/testing/selftests/vfio/vfio_dma_mapping_test.c20
-rw-r--r--tools/testing/selftests/vfio/vfio_pci_driver_test.c12
-rw-r--r--virt/kvm/guest_memfd.c45
1087 files changed, 27304 insertions, 10995 deletions
diff --git a/.mailmap b/.mailmap
index 369cfe467932..fffbfd413474 100644
--- a/.mailmap
+++ b/.mailmap
@@ -206,6 +206,7 @@ Danilo Krummrich <dakr@kernel.org> <dakr@redhat.com>
David Brownell <david-b@pacbell.net>
David Collins <quic_collinsd@quicinc.com> <collinsd@codeaurora.org>
David Heidelberg <david@ixit.cz> <d.okias@gmail.com>
+David Hildenbrand <david@kernel.org> <david@redhat.com>
David Rheinsberg <david@readahead.eu> <dh.herrmann@gmail.com>
David Rheinsberg <david@readahead.eu> <dh.herrmann@googlemail.com>
David Rheinsberg <david@readahead.eu> <david.rheinsberg@gmail.com>
@@ -426,7 +427,7 @@ Kenneth W Chen <kenneth.w.chen@intel.com>
Kenneth Westfield <quic_kwestfie@quicinc.com> <kwestfie@codeaurora.org>
Kiran Gunda <quic_kgunda@quicinc.com> <kgunda@codeaurora.org>
Kirill Tkhai <tkhai@ya.ru> <ktkhai@virtuozzo.com>
-Kirill A. Shutemov <kas@kernel.org> <kirill.shutemov@linux.intel.com>
+Kiryl Shutsemau <kas@kernel.org> <kirill.shutemov@linux.intel.com>
Kishon Vijay Abraham I <kishon@kernel.org> <kishon@ti.com>
Konrad Dybcio <konradybcio@kernel.org> <konrad.dybcio@linaro.org>
Konrad Dybcio <konradybcio@kernel.org> <konrad.dybcio@somainline.org>
@@ -437,6 +438,7 @@ Krishna Manikandan <quic_mkrishn@quicinc.com> <mkrishn@codeaurora.org>
Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski.k@gmail.com>
Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski@samsung.com>
Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@canonical.com>
+Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@linaro.org>
Krzysztof Wilczyński <kwilczynski@kernel.org> <krzysztof.wilczynski@linux.com>
Krzysztof Wilczyński <kwilczynski@kernel.org> <kw@linux.com>
Kshitiz Godara <quic_kgodara@quicinc.com> <kgodara@codeaurora.org>
@@ -605,7 +607,8 @@ Oleksij Rempel <o.rempel@pengutronix.de>
Oleksij Rempel <o.rempel@pengutronix.de> <ore@pengutronix.de>
Oliver Hartkopp <socketcan@hartkopp.net> <oliver.hartkopp@volkswagen.de>
Oliver Hartkopp <socketcan@hartkopp.net> <oliver@hartkopp.net>
-Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
+Oliver Upton <oupton@kernel.org> <oupton@google.com>
+Oliver Upton <oupton@kernel.org> <oliver.upton@linux.dev>
Ondřej Jirman <megi@xff.cz> <megous@megous.com>
Oza Pawandeep <quic_poza@quicinc.com> <poza@codeaurora.org>
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
@@ -688,6 +691,8 @@ Sachin Mokashi <sachin.mokashi@intel.com> <sachinx.mokashi@intel.com>
Sachin P Sant <ssant@in.ibm.com>
Sai Prakash Ranjan <quic_saipraka@quicinc.com> <saiprakash.ranjan@codeaurora.org>
Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi>
+Sam Protsenko <semen.protsenko@linaro.org>
+Sam Protsenko <semen.protsenko@linaro.org> <semen.protsenko@globallogic.com>
Sam Ravnborg <sam@mars.ravnborg.org>
Sankeerth Billakanti <quic_sbillaka@quicinc.com> <sbillaka@codeaurora.org>
Santosh Shilimkar <santosh.shilimkar@oracle.org>
diff --git a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml
index 19d47fd414bc..ce04d2eadec9 100644
--- a/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml
+++ b/Documentation/devicetree/bindings/pinctrl/toshiba,visconti-pinctrl.yaml
@@ -50,18 +50,20 @@ patternProperties:
groups:
description:
Name of the pin group to use for the functions.
- $ref: /schemas/types.yaml#/definitions/string
- enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp,
- i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp,
- spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp,
- spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp,
- uart0_grp, uart1_grp, uart2_grp, uart3_grp,
- pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp,
- pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp,
- pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp,
- pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp,
- pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp,
- pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp]
+ items:
+ enum: [i2c0_grp, i2c1_grp, i2c2_grp, i2c3_grp, i2c4_grp,
+ i2c5_grp, i2c6_grp, i2c7_grp, i2c8_grp,
+ spi0_grp, spi0_cs0_grp, spi0_cs1_grp, spi0_cs2_grp,
+ spi1_grp, spi2_grp, spi3_grp, spi4_grp, spi5_grp, spi6_grp,
+ uart0_grp, uart1_grp, uart2_grp, uart3_grp,
+ pwm0_gpio4_grp, pwm0_gpio8_grp, pwm0_gpio12_grp,
+ pwm0_gpio16_grp, pwm1_gpio5_grp, pwm1_gpio9_grp,
+ pwm1_gpio13_grp, pwm1_gpio17_grp, pwm2_gpio6_grp,
+ pwm2_gpio10_grp, pwm2_gpio14_grp, pwm2_gpio18_grp,
+ pwm3_gpio7_grp, pwm3_gpio11_grp, pwm3_gpio15_grp,
+ pwm3_gpio19_grp, pcmif_out_grp, pcmif_in_grp]
+ minItems: 1
+ maxItems: 8
drive-strength:
enum: [2, 4, 6, 8, 16, 24, 32]
diff --git a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml
index 55ece6a8be5e..81e2164ea98f 100644
--- a/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml
+++ b/Documentation/devicetree/bindings/pinctrl/xlnx,versal-pinctrl.yaml
@@ -74,6 +74,7 @@ patternProperties:
'^conf':
type: object
+ unevaluatedProperties: false
description:
Pinctrl node's client devices use subnodes for pin configurations,
which in turn use the standard properties below.
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 387fd9cc72ca..da982ca7e413 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -135,6 +135,27 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
* ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
+``struct iomap_read_ops``
+--------------------------
+
+.. code-block:: c
+
+ struct iomap_read_ops {
+ int (*read_folio_range)(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t len);
+ void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+ };
+
+iomap calls these functions:
+
+ - ``read_folio_range``: Called to read in the range. This must be provided
+ by the caller. If this succeeds, iomap_finish_folio_read() must be called
+ after the range is read in, regardless of whether the read succeeded or
+ failed.
+
+ - ``submit_read``: Submit any pending read requests. This function is
+ optional.
+
Internal per-Folio State
------------------------
@@ -182,6 +203,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
The pagecache takes whatever locks it needs before calling the
filesystem.
+Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
+iomap_read_folio_ctx``:
+
+.. code-block:: c
+
+ struct iomap_read_folio_ctx {
+ const struct iomap_read_ops *ops;
+ struct folio *cur_folio;
+ struct readahead_control *rac;
+ void *read_ctx;
+ };
+
+``iomap_readahead`` must set:
+ * ``ops->read_folio_range()`` and ``rac``
+
+``iomap_read_folio`` must set:
+ * ``ops->read_folio_range()`` and ``cur_folio``
+
+``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
+pass in any custom data the caller needs accessible in the ops callbacks for
+fulfilling reads.
+
Buffered Writes
---------------
@@ -317,6 +360,9 @@ The fields are as follows:
delalloc reservations to avoid having delalloc reservations for
clean pagecache.
This function must be supplied by the filesystem.
+ If this succeeds, iomap_finish_folio_write() must be called once writeback
+ completes for the range, regardless of whether the writeback succeeded or
+ failed.
- ``writeback_submit``: Submit the previous built writeback context.
Block based file systems should use the iomap_ioend_writeback_submit
@@ -444,10 +490,6 @@ These ``struct kiocb`` flags are significant for direct I/O with iomap:
Only meaningful for asynchronous I/O, and only if the entire I/O can
be issued as a single ``struct bio``.
- * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's
- process context.
- See ``linux/fs.h`` for more details.
-
Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and
``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
function for the file.
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 7233b04668fc..d33429294252 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -211,7 +211,7 @@ test and set for you.
e.g.::
inode = iget_locked(sb, ino);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
err = read_inode_from_disk(inode);
if (err < 0) {
iget_failed(inode);
@@ -1309,3 +1309,16 @@ a different length, use
vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len))
instead.
+
+---
+
+**mandatory**
+
+vfs_mkdir() now returns a dentry - the one returned by ->mkdir(). If
+that dentry is different from the dentry passed in, including if it is
+an IS_ERR() dentry pointer, the original dentry is dput().
+
+When vfs_mkdir() returns an error, and so both dput()s the original
+dentry and doesn't provide a replacement, it also unlocks the parent.
+Consequently the return value from vfs_mkdir() can be passed to
+end_creating() and the parent will be unlocked precisely when necessary.
diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst
index 1ead9bb8d9c6..4424cbff251f 100644
--- a/Documentation/input/event-codes.rst
+++ b/Documentation/input/event-codes.rst
@@ -400,19 +400,30 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz).
All other axes retain their meaning. A device must not mix
regular directional axes and accelerometer axes on the same event node.
-INPUT_PROP_HAPTIC_TOUCHPAD
---------------------------
+INPUT_PROP_PRESSUREPAD
+----------------------
+
+The INPUT_PROP_PRESSUREPAD property indicates that the device provides
+simulated haptic feedback (e.g. a vibrator motor situated below the surface)
+instead of physical haptic feedback (e.g. a hinge). This property is only set
+if the device:
-The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device:
-- supports simple haptic auto and manual triggering
- can differentiate between at least 5 fingers
- uses correct resolution for the X/Y (units and value)
-- reports correct force per touch, and correct units for them (newtons or grams)
- follows the MT protocol type B
+If the simulated haptic feedback is controllable by userspace the device must:
+
+- support simple haptic auto and manual triggering, and
+- report correct force per touch, and correct units for them (newtons or grams), and
+- provide the EV_FF FF_HAPTIC force feedback effect.
+
Summing up, such devices follow the MS spec for input devices in
-Win8 and Win8.1, and in addition support the Simple haptic controller HID table,
-and report correct units for the pressure.
+Win8 and Win8.1, and in addition may support the Simple haptic controller HID
+table, and report correct units for the pressure.
+
+Where applicable, this property is set in addition to INPUT_PROP_BUTTONPAD; it
+does not replace that property.
Guidelines
==========
diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst
index 3fb7ea3ab22a..9899871d3d9a 100644
--- a/Documentation/locking/seqlock.rst
+++ b/Documentation/locking/seqlock.rst
@@ -220,13 +220,14 @@ Read path, three categories:
according to a passed marker. This is used to avoid lockless readers
starvation (too many retry loops) in case of a sharp spike in write
activity. First, a lockless read is tried (even marker passed). If
- that trial fails (odd sequence counter is returned, which is used as
- the next iteration marker), the lockless read is transformed to a
- full locking read and no retry loop is necessary::
+ that trial fails (sequence counter doesn't match), the marker is made
+ odd for the next iteration, so the lockless read is transformed to a
+ full locking read and no retry loop is necessary. For example::
/* marker; odd initialization, incremented to even before the first pass */
- int seq = 0;
+ int seq = 1;
do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&foo_seqlock, &seq);
/* ... [[read-side critical section]] ... */
diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst
index 57d1964453e1..d5363b08f515 100644
--- a/Documentation/sound/codecs/cs35l56.rst
+++ b/Documentation/sound/codecs/cs35l56.rst
@@ -105,10 +105,10 @@ In this example the SSID is 10280c63.
The format of the firmware file names is:
-SoundWire (except CS35L56 Rev B0):
+SoundWire:
cs35lxx-b0-dsp1-misc-SSID[-spkidX]-l?u?
-SoundWire CS35L56 Rev B0:
+SoundWire CS35L56 Rev B0 firmware released before kernel version 6.16:
cs35lxx-b0-dsp1-misc-SSID[-spkidX]-ampN
Non-SoundWire (HDA and I2S):
@@ -127,9 +127,8 @@ Where:
* spkidX is an optional part, used for laptops that have firmware
configurations for different makes and models of internal speakers.
-The CS35L56 Rev B0 continues to use the old filename scheme because a
-large number of firmware files have already been published with these
-names.
+Early firmware for CS35L56 Rev B0 used the ALSA prefix (ampN) as the
+filename qualifier. Support for the l?u? qualifier was added in kernel 6.16.
Sound Open Firmware and ALSA topology files
-------------------------------------------
diff --git a/Documentation/userspace-api/netlink/intro-specs.rst b/Documentation/userspace-api/netlink/intro-specs.rst
index a4435ae4628d..e5ebc617754a 100644
--- a/Documentation/userspace-api/netlink/intro-specs.rst
+++ b/Documentation/userspace-api/netlink/intro-specs.rst
@@ -13,10 +13,10 @@ Simple CLI
Kernel comes with a simple CLI tool which should be useful when
developing Netlink related code. The tool is implemented in Python
and can use a YAML specification to issue Netlink requests
-to the kernel. Only Generic Netlink is supported.
+to the kernel.
The tool is located at ``tools/net/ynl/pyynl/cli.py``. It accepts
-a handul of arguments, the most important ones are:
+a handful of arguments, the most important ones are:
- ``--spec`` - point to the spec file
- ``--do $name`` / ``--dump $name`` - issue request ``$name``
diff --git a/Documentation/wmi/driver-development-guide.rst b/Documentation/wmi/driver-development-guide.rst
index 99ef21fc1c1e..5680303ae314 100644
--- a/Documentation/wmi/driver-development-guide.rst
+++ b/Documentation/wmi/driver-development-guide.rst
@@ -54,6 +54,7 @@ to matching WMI devices using a struct wmi_device_id table:
::
static const struct wmi_device_id foo_id_table[] = {
+ /* Only use uppercase letters! */
{ "936DA01F-9ABD-4D9D-80C7-02AF85C822A8", NULL },
{ }
};
diff --git a/MAINTAINERS b/MAINTAINERS
index 7bf6385efe04..980a3d5d6e90 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -915,6 +915,7 @@ F: drivers/staging/media/sunxi/cedrus/
ALPHA PORT
M: Richard Henderson <richard.henderson@linaro.org>
M: Matt Turner <mattst88@gmail.com>
+M: Magnus Lindholm <linmag7@gmail.com>
L: linux-alpha@vger.kernel.org
S: Odd Fixes
F: arch/alpha/
@@ -3925,7 +3926,7 @@ F: crypto/async_tx/
F: include/linux/async_tx.h
AT24 EEPROM DRIVER
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-i2c@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
@@ -4398,7 +4399,7 @@ BLOCK LAYER
M: Jens Axboe <axboe@kernel.dk>
L: linux-block@vger.kernel.org
S: Maintained
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
F: Documentation/ABI/stable/sysfs-block
F: Documentation/block/
F: block/
@@ -9208,6 +9209,7 @@ R: Yue Hu <zbestahu@gmail.com>
R: Jeffle Xu <jefflexu@linux.alibaba.com>
R: Sandeep Dhavale <dhavale@google.com>
R: Hongbo Li <lihongbo22@huawei.com>
+R: Chunhai Guo <guochunhai@vivo.com>
L: linux-erofs@lists.ozlabs.org
S: Maintained
W: https://erofs.docs.kernel.org
@@ -9264,7 +9266,6 @@ M: Ido Schimmel <idosch@nvidia.com>
L: bridge@lists.linux.dev
L: netdev@vger.kernel.org
S: Maintained
-W: http://www.linuxfoundation.org/en/Net:Bridge
F: include/linux/if_bridge.h
F: include/uapi/linux/if_bridge.h
F: include/linux/netfilter_bridge/
@@ -10677,7 +10678,7 @@ F: tools/gpio/gpio-sloppy-logic-analyzer.sh
GPIO SUBSYSTEM
M: Linus Walleij <linus.walleij@linaro.org>
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-gpio@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
@@ -10694,7 +10695,7 @@ K: GPIOD_FLAGS_BIT_NONEXCLUSIVE
K: devm_gpiod_unhinge
GPIO UAPI
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
R: Kent Gibson <warthog618@gmail.com>
L: linux-gpio@vger.kernel.org
S: Maintained
@@ -11526,7 +11527,7 @@ F: include/linux/platform_data/huawei-gaokun-ec.h
HUGETLB SUBSYSTEM
M: Muchun Song <muchun.song@linux.dev>
M: Oscar Salvador <osalvador@suse.de>
-R: David Hildenbrand <david@redhat.com>
+R: David Hildenbrand <david@kernel.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@ -13659,7 +13660,7 @@ F: virt/kvm/*
KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
M: Marc Zyngier <maz@kernel.org>
-M: Oliver Upton <oliver.upton@linux.dev>
+M: Oliver Upton <oupton@kernel.org>
R: Joey Gouly <joey.gouly@arm.com>
R: Suzuki K Poulose <suzuki.poulose@arm.com>
R: Zenghui Yu <yuzenghui@huawei.com>
@@ -13733,7 +13734,7 @@ KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
M: Christian Borntraeger <borntraeger@linux.ibm.com>
M: Janosch Frank <frankja@linux.ibm.com>
M: Claudio Imbrenda <imbrenda@linux.ibm.com>
-R: David Hildenbrand <david@redhat.com>
+R: David Hildenbrand <david@kernel.org>
L: kvm@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
@@ -13798,6 +13799,7 @@ F: Documentation/admin-guide/mm/kho.rst
F: Documentation/core-api/kho/*
F: include/linux/kexec_handover.h
F: kernel/kexec_handover.c
+F: lib/test_kho.c
F: tools/testing/selftests/kho/
KEYS-ENCRYPTED
@@ -14535,6 +14537,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
F: Documentation/locking/
F: arch/*/include/asm/spinlock*.h
+F: include/linux/local_lock*.h
F: include/linux/lockdep*.h
F: include/linux/mutex*.h
F: include/linux/rwlock*.h
@@ -15309,7 +15312,7 @@ F: drivers/pwm/pwm-max7360.c
F: include/linux/mfd/max7360.h
MAXIM MAX77650 PMIC MFD DRIVER
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/*/*max77650.yaml
@@ -16205,7 +16208,7 @@ MEMORY CONTROLLER DRIVERS
M: Krzysztof Kozlowski <krzk@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
-B: mailto:krzysztof.kozlowski@linaro.org
+B: mailto:krzk@kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl.git
F: Documentation/devicetree/bindings/memory-controllers/
F: drivers/memory/
@@ -16221,7 +16224,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/linux.git
F: drivers/devfreq/tegra30-devfreq.c
MEMORY HOT(UN)PLUG
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
M: Oscar Salvador <osalvador@suse.de>
L: linux-mm@kvack.org
S: Maintained
@@ -16246,7 +16249,7 @@ F: tools/mm/
MEMORY MANAGEMENT - CORE
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
@@ -16302,7 +16305,7 @@ F: mm/execmem.c
MEMORY MANAGEMENT - GUP (GET USER PAGES)
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Jason Gunthorpe <jgg@nvidia.com>
R: John Hubbard <jhubbard@nvidia.com>
R: Peter Xu <peterx@redhat.com>
@@ -16318,7 +16321,7 @@ F: tools/testing/selftests/mm/gup_test.c
MEMORY MANAGEMENT - KSM (Kernel Samepage Merging)
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Xu Xin <xu.xin16@zte.com.cn>
R: Chengming Zhou <chengming.zhou@linux.dev>
L: linux-mm@kvack.org
@@ -16334,7 +16337,7 @@ F: mm/mm_slot.h
MEMORY MANAGEMENT - MEMORY POLICY AND MIGRATION
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Zi Yan <ziy@nvidia.com>
R: Matthew Brost <matthew.brost@intel.com>
R: Joshua Hahn <joshua.hahnjy@gmail.com>
@@ -16374,7 +16377,7 @@ F: mm/workingset.c
MEMORY MANAGEMENT - MISC
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
R: Vlastimil Babka <vbabka@suse.cz>
@@ -16462,7 +16465,7 @@ F: mm/shuffle.h
MEMORY MANAGEMENT - RECLAIM
M: Andrew Morton <akpm@linux-foundation.org>
M: Johannes Weiner <hannes@cmpxchg.org>
-R: David Hildenbrand <david@redhat.com>
+R: David Hildenbrand <david@kernel.org>
R: Michal Hocko <mhocko@kernel.org>
R: Qi Zheng <zhengqi.arch@bytedance.com>
R: Shakeel Butt <shakeel.butt@linux.dev>
@@ -16475,7 +16478,7 @@ F: mm/workingset.c
MEMORY MANAGEMENT - RMAP (REVERSE MAPPING)
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Rik van Riel <riel@surriel.com>
R: Liam R. Howlett <Liam.Howlett@oracle.com>
@@ -16499,12 +16502,12 @@ F: mm/secretmem.c
MEMORY MANAGEMENT - SWAP
M: Andrew Morton <akpm@linux-foundation.org>
+M: Chris Li <chrisl@kernel.org>
+M: Kairui Song <kasong@tencent.com>
R: Kemeng Shi <shikemeng@huaweicloud.com>
-R: Kairui Song <kasong@tencent.com>
R: Nhat Pham <nphamcs@gmail.com>
R: Baoquan He <bhe@redhat.com>
R: Barry Song <baohua@kernel.org>
-R: Chris Li <chrisl@kernel.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/mm/swap-table.rst
@@ -16520,7 +16523,7 @@ F: mm/swapfile.c
MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE)
M: Andrew Morton <akpm@linux-foundation.org>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
R: Zi Yan <ziy@nvidia.com>
R: Baolin Wang <baolin.wang@linux.alibaba.com>
@@ -16622,7 +16625,7 @@ MEMORY MAPPING - MADVISE (MEMORY ADVICE)
M: Andrew Morton <akpm@linux-foundation.org>
M: Liam R. Howlett <Liam.Howlett@oracle.com>
M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
R: Vlastimil Babka <vbabka@suse.cz>
R: Jann Horn <jannh@google.com>
L: linux-mm@kvack.org
@@ -18780,6 +18783,10 @@ S: Maintained
F: arch/arm/*omap*/*clock*
OMAP DEVICE TREE SUPPORT
+M: Aaro Koskinen <aaro.koskinen@iki.fi>
+M: Andreas Kemnade <andreas@kemnade.info>
+M: Kevin Hilman <khilman@baylibre.com>
+M: Roger Quadros <rogerq@kernel.org>
M: Tony Lindgren <tony@atomide.com>
L: linux-omap@vger.kernel.org
L: devicetree@vger.kernel.org
@@ -19899,7 +19906,7 @@ F: drivers/pci/p2pdma.c
F: include/linux/pci-p2pdma.h
PCI POWER CONTROL
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-pci@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
@@ -20496,7 +20503,7 @@ F: include/linux/powercap.h
F: kernel/configs/nopm.config
POWER SEQUENCING
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-pm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
@@ -21178,7 +21185,7 @@ F: Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml
F: drivers/i2c/busses/i2c-qcom-cci.c
QUALCOMM INTERCONNECT BWMON DRIVER
-M: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+M: Krzysztof Kozlowski <krzk@kernel.org>
L: linux-arm-msm@vger.kernel.org
S: Maintained
F: Documentation/devicetree/bindings/interconnect/qcom,msm8998-bwmon.yaml
@@ -21299,7 +21306,7 @@ F: Documentation/tee/qtee.rst
F: drivers/tee/qcomtee/
QUALCOMM TRUST ZONE MEMORY ALLOCATOR
-M: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-arm-msm@vger.kernel.org
S: Maintained
F: drivers/firmware/qcom/qcom_tzmem.c
@@ -22650,7 +22657,7 @@ F: arch/s390/mm
S390 NETWORK DRIVERS
M: Alexandra Winter <wintera@linux.ibm.com>
-R: Aswin Karuvally <aswin@linux.ibm.com>
+M: Aswin Karuvally <aswin@linux.ibm.com>
L: linux-s390@vger.kernel.org
L: netdev@vger.kernel.org
S: Supported
@@ -25667,7 +25674,7 @@ F: Documentation/devicetree/bindings/crypto/ti,am62l-dthev2.yaml
F: drivers/crypto/ti/
TI DAVINCI MACHINE SUPPORT
-M: Bartosz Golaszewski <brgl@bgdev.pl>
+M: Bartosz Golaszewski <brgl@kernel.org>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git
@@ -26051,6 +26058,8 @@ S: Supported
W: https://www.tq-group.com/en/products/tq-embedded/
F: arch/arm/boot/dts/nxp/imx/*mba*.dts*
F: arch/arm/boot/dts/nxp/imx/*tqma*.dts*
+F: arch/arm/boot/dts/ti/omap/*mba*.dts*
+F: arch/arm/boot/dts/ti/omap/*tqma*.dts*
F: arch/arm64/boot/dts/freescale/fsl-*tqml*.dts*
F: arch/arm64/boot/dts/freescale/imx*mba*.dts*
F: arch/arm64/boot/dts/freescale/imx*tqma*.dts*
@@ -27089,7 +27098,7 @@ F: net/vmw_vsock/virtio_transport_common.c
VIRTIO BALLOON
M: "Michael S. Tsirkin" <mst@redhat.com>
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
L: virtualization@lists.linux.dev
S: Maintained
F: drivers/virtio/virtio_balloon.c
@@ -27117,7 +27126,7 @@ S: Maintained
F: drivers/char/virtio_console.c
F: include/uapi/linux/virtio_console.h
-VIRTIO CORE AND NET DRIVERS
+VIRTIO CORE
M: "Michael S. Tsirkin" <mst@redhat.com>
M: Jason Wang <jasowang@redhat.com>
R: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
@@ -27130,7 +27139,6 @@ F: Documentation/devicetree/bindings/virtio/
F: Documentation/driver-api/virtio/
F: drivers/block/virtio_blk.c
F: drivers/crypto/virtio/
-F: drivers/net/virtio_net.c
F: drivers/vdpa/
F: drivers/virtio/
F: include/linux/vdpa.h
@@ -27139,7 +27147,6 @@ F: include/linux/vringh.h
F: include/uapi/linux/virtio_*.h
F: net/vmw_vsock/virtio*
F: tools/virtio/
-F: tools/testing/selftests/drivers/net/virtio_net/
VIRTIO CRYPTO DRIVER
M: Gonglei <arei.gonglei@huawei.com>
@@ -27161,6 +27168,7 @@ F: arch/s390/include/uapi/asm/virtio-ccw.h
F: drivers/s390/virtio/
VIRTIO FILE SYSTEM
+M: German Maglione <gmaglione@redhat.com>
M: Vivek Goyal <vgoyal@redhat.com>
M: Stefan Hajnoczi <stefanha@redhat.com>
M: Miklos Szeredi <miklos@szeredi.hu>
@@ -27244,13 +27252,26 @@ F: drivers/iommu/virtio-iommu.c
F: include/uapi/linux/virtio_iommu.h
VIRTIO MEM DRIVER
-M: David Hildenbrand <david@redhat.com>
+M: David Hildenbrand <david@kernel.org>
L: virtualization@lists.linux.dev
S: Maintained
W: https://virtio-mem.gitlab.io/
F: drivers/virtio/virtio_mem.c
F: include/uapi/linux/virtio_mem.h
+VIRTIO NET DRIVER
+M: "Michael S. Tsirkin" <mst@redhat.com>
+M: Jason Wang <jasowang@redhat.com>
+R: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
+R: Eugenio Pérez <eperezma@redhat.com>
+L: netdev@vger.kernel.org
+L: virtualization@lists.linux.dev
+S: Maintained
+F: drivers/net/virtio_net.c
+F: include/linux/virtio_net.h
+F: include/uapi/linux/virtio_net.h
+F: tools/testing/selftests/drivers/net/virtio_net/
+
VIRTIO PMEM DRIVER
M: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
L: virtualization@lists.linux.dev
@@ -27850,7 +27871,7 @@ F: arch/x86/kernel/stacktrace.c
F: arch/x86/kernel/unwind_*.c
X86 TRUST DOMAIN EXTENSIONS (TDX)
-M: Kirill A. Shutemov <kas@kernel.org>
+M: Kiryl Shutsemau <kas@kernel.org>
R: Dave Hansen <dave.hansen@linux.intel.com>
R: Rick Edgecombe <rick.p.edgecombe@intel.com>
L: x86@kernel.org
diff --git a/Makefile b/Makefile
index fb4389aa5d5f..6f0e72ff4d0c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 18
SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION =
NAME = Baby Opossum Posse
# *DOCUMENTATION*
@@ -1061,6 +1061,9 @@ NOSTDINC_FLAGS += -nostdinc
# perform bounds checking.
KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3)
+# Allow including a tagged struct or union anonymously in another struct/union.
+KBUILD_CFLAGS += -fms-extensions
+
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += -fno-strict-overflow
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 16dca28ebf17..3fed97478058 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -509,3 +509,4 @@
577 common open_tree_attr sys_open_tree_attr
578 common file_getattr sys_file_getattr
579 common file_setattr sys_file_setattr
+580 common listns sys_listns
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
index aa9576d8ab56..48ca25f57ef6 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
@@ -1254,3 +1254,17 @@
max-frequency = <25000000>;
bus-width = <4>;
};
+
+/*
+ * FIXME: rgmii delay is introduced by MAC (configured in u-boot now)
+ * instead of PCB on fuji board, so the "phy-mode" should be updated to
+ * "rgmii-[tx|rx]id" when the aspeed-mac driver can handle the delay
+ * properly.
+ */
+&mac3 {
+ status = "okay";
+ phy-mode = "rgmii";
+ phy-handle = <&ethphy3>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_rgmii4_default>;
+};
diff --git a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
index ac44c745bdf8..a39a021a3910 100644
--- a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
+++ b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
@@ -55,8 +55,8 @@
mdio {
/delete-node/ switch@1e;
- bcm54210e: ethernet-phy@0 {
- reg = <0>;
+ bcm54210e: ethernet-phy@25 {
+ reg = <25>;
};
};
};
diff --git a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
index 06545a6052f7..43ff5eafb2bb 100644
--- a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
@@ -259,7 +259,7 @@
pinctrl-0 = <&pinctrl_audmux>;
status = "okay";
- ssi2 {
+ mux-ssi2 {
fsl,audmux-port = <1>;
fsl,port-config = <
(IMX_AUDMUX_V2_PTCR_SYN |
@@ -271,7 +271,7 @@
>;
};
- aud3 {
+ mux-aud3 {
fsl,audmux-port = <2>;
fsl,port-config = <
IMX_AUDMUX_V2_PTCR_SYN
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
index 6de224dd2bb9..6eb80f867f50 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
@@ -339,7 +339,7 @@
#sound-dai-cells = <0>;
compatible = "fsl,imx6ul-sai", "fsl,imx6sx-sai";
reg = <0x02030000 0x4000>;
- interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clks IMX6UL_CLK_SAI3_IPG>,
<&clks IMX6UL_CLK_SAI3>,
<&clks IMX6UL_CLK_DUMMY>, <&clks IMX6UL_CLK_DUMMY>;
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
index 107b00b9a939..540642e99a41 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
@@ -136,7 +136,7 @@
interrupt-parent = <&gpio2>;
interrupts = <8 IRQ_TYPE_EDGE_FALLING>;
reset-gpios = <&gpio2 14 GPIO_ACTIVE_LOW>;
- report-rate-hz = <6>;
+ report-rate-hz = <60>;
/* settings valid only for Hycon touchscreen */
touchscreen-size-x = <1280>;
touchscreen-size-y = <800>;
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..fd09afae72a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts
index b8f256545022..3e0319fdb93f 100644
--- a/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts
+++ b/arch/arm64/boot/dts/broadcom/bcm2712-rpi-5-b.dts
@@ -18,11 +18,21 @@
#include "bcm2712-rpi-5-b-ovl-rp1.dts"
+/ {
+ aliases {
+ ethernet0 = &rp1_eth;
+ };
+};
+
&pcie2 {
#include "rp1-nexus.dtsi"
};
&rp1_eth {
+ assigned-clocks = <&rp1_clocks RP1_CLK_ETH_TSU>,
+ <&rp1_clocks RP1_CLK_ETH>;
+ assigned-clock-rates = <50000000>,
+ <125000000>;
status = "okay";
phy-mode = "rgmii-id";
phy-handle = <&phy1>;
diff --git a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi
index 2cf0f7208350..a72b2f1c4a1b 100644
--- a/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8-ss-img.dtsi
@@ -67,7 +67,6 @@ img_subsys: bus@58000000 {
power-domains = <&pd IMX_SC_R_CSI_0>;
fsl,channel = <0>;
fsl,num-irqs = <32>;
- status = "disabled";
};
gpio0_mipi_csi0: gpio@58222000 {
@@ -144,7 +143,6 @@ img_subsys: bus@58000000 {
power-domains = <&pd IMX_SC_R_CSI_1>;
fsl,channel = <0>;
fsl,num-irqs = <32>;
- status = "disabled";
};
gpio0_mipi_csi1: gpio@58242000 {
diff --git a/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi
index a66ba6d0a8c0..da33a35c6d46 100644
--- a/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8dxl-ss-conn.dtsi
@@ -29,8 +29,8 @@
compatible = "nxp,imx8dxl-dwmac-eqos", "snps,dwmac-5.10a";
reg = <0x5b050000 0x10000>;
interrupt-parent = <&gic>;
- interrupts = <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
+ interrupts = <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 163 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "macirq", "eth_wake_irq";
clocks = <&eqos_lpcg IMX_LPCG_CLK_4>,
<&eqos_lpcg IMX_LPCG_CLK_6>,
diff --git a/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi b/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi
index ec466e4d7df5..5c0d09c5c086 100644
--- a/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8dxl-ss-hsio.dtsi
@@ -54,3 +54,8 @@
interrupt-names = "dma";
};
};
+
+&pcieb_ep {
+ interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "dma";
+};
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts
index 614b4ce330b1..0924ac50fd2d 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts
+++ b/arch/arm64/boot/dts/freescale/imx8mp-kontron-bl-osm-s.dts
@@ -16,11 +16,20 @@
ethernet1 = &eqos;
};
- extcon_usbc: usbc {
- compatible = "linux,extcon-usb-gpio";
+ connector {
+ compatible = "gpio-usb-b-connector", "usb-b-connector";
+ id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>;
+ label = "Type-C";
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_usb1_id>;
- id-gpios = <&gpio1 10 GPIO_ACTIVE_HIGH>;
+ type = "micro";
+ vbus-supply = <&reg_usb1_vbus>;
+
+ port {
+ usb_dr_connector: endpoint {
+ remote-endpoint = <&usb3_dwc>;
+ };
+ };
};
leds {
@@ -244,9 +253,15 @@
hnp-disable;
srp-disable;
dr_mode = "otg";
- extcon = <&extcon_usbc>;
usb-role-switch;
+ role-switch-default-mode = "peripheral";
status = "okay";
+
+ port {
+ usb3_dwc: endpoint {
+ remote-endpoint = <&usb_dr_connector>;
+ };
+ };
};
&usb_dwc3_1 {
@@ -273,7 +288,6 @@
};
&usb3_phy0 {
- vbus-supply = <&reg_usb1_vbus>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts
index 202d5c67ac40..9c0b6b8d6459 100644
--- a/arch/arm64/boot/dts/freescale/imx8qm-mek.dts
+++ b/arch/arm64/boot/dts/freescale/imx8qm-mek.dts
@@ -217,8 +217,8 @@
compatible = "nxp,cbdtu02043", "gpio-sbu-mux";
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_typec_mux>;
- select-gpios = <&lsio_gpio4 6 GPIO_ACTIVE_LOW>;
- enable-gpios = <&lsio_gpio4 19 GPIO_ACTIVE_HIGH>;
+ select-gpios = <&lsio_gpio4 6 GPIO_ACTIVE_HIGH>;
+ enable-gpios = <&lsio_gpio4 19 GPIO_ACTIVE_LOW>;
orientation-switch;
port {
diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi
index 1292677cbe4e..6da961eb3fe5 100644
--- a/arch/arm64/boot/dts/freescale/imx95.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx95.dtsi
@@ -1886,7 +1886,7 @@
assigned-clock-rates = <3600000000>, <100000000>, <10000000>;
assigned-clock-parents = <0>, <0>,
<&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>;
- msi-map = <0x0 &its 0x98 0x1>;
+ msi-map = <0x0 &its 0x10 0x1>;
power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>;
status = "disabled";
};
@@ -1963,6 +1963,7 @@
assigned-clock-rates = <3600000000>, <100000000>, <10000000>;
assigned-clock-parents = <0>, <0>,
<&scmi_clk IMX95_CLK_SYSPLL1_PFD1_DIV2>;
+ msi-map = <0x0 &its 0x98 0x1>;
power-domains = <&scmi_devpd IMX95_PD_HSIO_TOP>;
status = "disabled";
};
diff --git a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi
index a410fc335fa3..c0f17f8189fa 100644
--- a/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra194-p3668.dtsi
@@ -42,6 +42,7 @@
interrupt-parent = <&gpio>;
interrupts = <TEGRA194_MAIN_GPIO(G, 4) IRQ_TYPE_LEVEL_LOW>;
#phy-cells = <0>;
+ wakeup-source;
};
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
index 283d9cbc4368..03b7c4313750 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
@@ -598,7 +598,6 @@
pinctrl-2 = <&otp_pin>;
resets = <&cru SRST_TSADC>;
reset-names = "tsadc-apb";
- rockchip,grf = <&grf>;
rockchip,hw-tshut-temp = <100000>;
#thermal-sensor-cells = <1>;
status = "disabled";
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
index c4f4f1ff6117..9da6fd82e46b 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
@@ -3,7 +3,7 @@
* Copyright (c) 2016-2017 Fuzhou Rockchip Electronics Co., Ltd
*/
-#include "rk3399.dtsi"
+#include "rk3399-base.dtsi"
/ {
cluster0_opp: opp-table-0 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
index 5e8f729c2cf2..141a921a06e4 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
+++ b/arch/arm64/boot/dts/rockchip/rk3399-puma-haikou-video-demo.dtso
@@ -45,11 +45,11 @@
cam_dovdd_1v8: regulator-cam-dovdd-1v8 {
compatible = "regulator-fixed";
- gpio = <&pca9670 3 GPIO_ACTIVE_LOW>;
- regulator-max-microvolt = <1800000>;
- regulator-min-microvolt = <1800000>;
- regulator-name = "cam-dovdd-1v8";
- vin-supply = <&vcc1v8_video>;
+ gpio = <&pca9670 3 GPIO_ACTIVE_LOW>;
+ regulator-max-microvolt = <1800000>;
+ regulator-min-microvolt = <1800000>;
+ regulator-name = "cam-dovdd-1v8";
+ vin-supply = <&vcc1v8_video>;
};
cam_dvdd_1v2: regulator-cam-dvdd-1v2 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
index 7f578c50b4ad..b6cf03a7ba66 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3566-bigtreetech-cb2.dtsi
@@ -120,7 +120,7 @@
compatible = "regulator-fixed";
regulator-name = "vcc3v3_pcie";
enable-active-high;
- gpios = <&gpio0 RK_PB1 GPIO_ACTIVE_HIGH>;
+ gpios = <&gpio4 RK_PB1 GPIO_ACTIVE_HIGH>;
pinctrl-names = "default";
pinctrl-0 = <&pcie_drv>;
regulator-always-on;
@@ -187,7 +187,7 @@
vcc5v0_usb2b: regulator-vcc5v0-usb2b {
compatible = "regulator-fixed";
enable-active-high;
- gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>;
+ gpio = <&gpio4 RK_PC4 GPIO_ACTIVE_HIGH>;
pinctrl-names = "default";
pinctrl-0 = <&vcc5v0_usb2b_en>;
regulator-name = "vcc5v0_usb2b";
@@ -199,7 +199,7 @@
vcc5v0_usb2t: regulator-vcc5v0-usb2t {
compatible = "regulator-fixed";
enable-active-high;
- gpios = <&gpio0 RK_PD5 GPIO_ACTIVE_HIGH>;
+ gpios = <&gpio3 RK_PD5 GPIO_ACTIVE_HIGH>;
pinctrl-names = "default";
pinctrl-0 = <&vcc5v0_usb2t_en>;
regulator-name = "vcc5v0_usb2t";
diff --git a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi
index d0e38412d56a..08bf40de17ea 100644
--- a/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3566-pinetab2.dtsi
@@ -789,7 +789,7 @@
vccio1-supply = <&vccio_acodec>;
vccio2-supply = <&vcc_1v8>;
vccio3-supply = <&vccio_sd>;
- vccio4-supply = <&vcc_1v8>;
+ vccio4-supply = <&vcca1v8_pmu>;
vccio5-supply = <&vcc_1v8>;
vccio6-supply = <&vcc1v8_dvp>;
vccio7-supply = <&vcc_3v3>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts
index 0f844806ec54..442a2bc43ba8 100644
--- a/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts
@@ -482,6 +482,8 @@
};
&i2s1_8ch {
+ pinctrl-names = "default";
+ pinctrl-0 = <&i2s1m0_sclktx &i2s1m0_lrcktx &i2s1m0_sdi0 &i2s1m0_sdo0>;
rockchip,trcm-sync-tx-only;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3576.dtsi b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
index fc4e9e07f1cf..a86fc6b4e8c4 100644
--- a/arch/arm64/boot/dts/rockchip/rk3576.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3576.dtsi
@@ -276,12 +276,6 @@
opp-microvolt = <900000 900000 950000>;
clock-latency-ns = <40000>;
};
-
- opp-2208000000 {
- opp-hz = /bits/ 64 <2208000000>;
- opp-microvolt = <950000 950000 950000>;
- clock-latency-ns = <40000>;
- };
};
cluster1_opp_table: opp-table-cluster1 {
@@ -348,12 +342,6 @@
opp-microvolt = <925000 925000 950000>;
clock-latency-ns = <40000>;
};
-
- opp-2304000000 {
- opp-hz = /bits/ 64 <2304000000>;
- opp-microvolt = <950000 950000 950000>;
- clock-latency-ns = <40000>;
- };
};
gpu_opp_table: opp-table-gpu {
@@ -2561,8 +2549,6 @@
interrupts = <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>;
pinctrl-names = "default";
pinctrl-0 = <&i2c9m0_xfer>;
- resets = <&cru SRST_I2C9>, <&cru SRST_P_I2C9>;
- reset-names = "i2c", "apb";
#address-cells = <1>;
#size-cells = <0>;
status = "disabled";
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi
index 0f1a77697351..b5d630d2c879 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-opp.dtsi
@@ -115,7 +115,7 @@
};
};
- gpu_opp_table: opp-table {
+ gpu_opp_table: opp-table-gpu {
compatible = "operating-points-v2";
opp-300000000 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
index b44e89e1bb15..365c1d958f2d 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588-tiger.dtsi
@@ -382,14 +382,12 @@
cap-mmc-highspeed;
mmc-ddr-1_8v;
mmc-hs200-1_8v;
- mmc-hs400-1_8v;
- mmc-hs400-enhanced-strobe;
mmc-pwrseq = <&emmc_pwrseq>;
no-sdio;
no-sd;
non-removable;
pinctrl-names = "default";
- pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk &emmc_data_strobe>;
+ pinctrl-0 = <&emmc_bus8 &emmc_cmd &emmc_clk>;
vmmc-supply = <&vcc_3v3_s3>;
vqmmc-supply = <&vcc_1v8_s3>;
status = "okay";
diff --git a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
index 9884a5df47df..e1e0e3fc0ca7 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3588j.dtsi
@@ -66,7 +66,7 @@
};
};
- gpu_opp_table: opp-table {
+ gpu_opp_table: opp-table-gpu {
compatible = "operating-points-v2";
opp-300000000 {
diff --git a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts
index ad6d04793b0a..83b9b6645a1e 100644
--- a/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3588s-orangepi-5.dts
@@ -14,8 +14,8 @@
gpios = <&gpio0 RK_PC5 GPIO_ACTIVE_HIGH>;
regulator-name = "vcc3v3_pcie20";
regulator-boot-on;
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <1800000>;
+ regulator-min-microvolt = <3300000>;
+ regulator-max-microvolt = <3300000>;
startup-delay-us = <50000>;
vin-supply = <&vcc5v0_sys>;
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index e3a2d37bd104..1a48faad2473 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1341,7 +1341,7 @@ CONFIG_COMMON_CLK_RS9_PCIE=y
CONFIG_COMMON_CLK_VC3=y
CONFIG_COMMON_CLK_VC5=y
CONFIG_COMMON_CLK_BD718XX=m
-CONFIG_CLK_RASPBERRYPI=m
+CONFIG_CLK_RASPBERRYPI=y
CONFIG_CLK_IMX8MM=y
CONFIG_CLK_IMX8MN=y
CONFIG_CLK_IMX8MP=y
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 00d97b8a757f..51746005239b 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -26,9 +26,12 @@ void __init apply_alternatives_all(void);
bool alternative_is_applied(u16 cpucap);
#ifdef CONFIG_MODULES
-void apply_alternatives_module(void *start, size_t length);
+int apply_alternatives_module(void *start, size_t length);
#else
-static inline void apply_alternatives_module(void *start, size_t length) { }
+static inline int apply_alternatives_module(void *start, size_t length)
+{
+ return 0;
+}
#endif
void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr,
diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h
index a81937fae9f6..21dbc9dda747 100644
--- a/arch/arm64/include/asm/kfence.h
+++ b/arch/arm64/include/asm/kfence.h
@@ -10,8 +10,6 @@
#include <asm/set_memory.h>
-static inline bool arch_kfence_init_pool(void) { return true; }
-
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
set_memory_valid(addr, 1, !protect);
@@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void)
{
return !kfence_early_init;
}
+bool arch_kfence_init_pool(void);
#else /* CONFIG_KFENCE */
static inline bool arm64_kfence_can_set_direct_map(void) { return false; }
#endif /* CONFIG_KFENCE */
diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h
index 2312e6ee595f..258cca4b4873 100644
--- a/arch/arm64/include/asm/page.h
+++ b/arch/arm64/include/asm/page.h
@@ -33,8 +33,8 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
unsigned long vaddr);
#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio
-void tag_clear_highpage(struct page *to);
-#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+bool tag_clear_highpages(struct page *to, int numpages);
+#define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES
#define clear_user_page(page, vaddr, pg) clear_page(page)
#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 9abcc8ef3087..b57b2bb00967 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -77,7 +77,7 @@ __percpu_##name##_case_##sz(void *ptr, unsigned long val) \
" stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n" \
" cbnz %w[loop], 1b", \
/* LSE atomics */ \
- #op_lse "\t%" #w "[val], %[ptr]\n" \
+ #op_lse "\t%" #w "[val], %" #w "[tmp], %[ptr]\n" \
__nops(3)) \
: [loop] "=&r" (loop), [tmp] "=&r" (tmp), \
[ptr] "+Q"(*(u##sz *)ptr) \
@@ -124,9 +124,16 @@ PERCPU_RW_OPS(8)
PERCPU_RW_OPS(16)
PERCPU_RW_OPS(32)
PERCPU_RW_OPS(64)
-PERCPU_OP(add, add, stadd)
-PERCPU_OP(andnot, bic, stclr)
-PERCPU_OP(or, orr, stset)
+
+/*
+ * Use value-returning atomics for CPU-local ops as they are more likely
+ * to execute "near" to the CPU (e.g. in L1$).
+ *
+ * https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop
+ */
+PERCPU_OP(add, add, ldadd)
+PERCPU_OP(andnot, bic, ldclr)
+PERCPU_OP(or, orr, ldset)
PERCPU_RET_OP(add, add, ldadd)
#undef PERCPU_RW_OPS
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index a76f9b387a26..c59f6324f2bb 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -53,7 +53,7 @@ enum {
EDYNSCS_INVALID_CFA_OPCODE = 4,
};
-int __pi_scs_patch(const u8 eh_frame[], int size);
+int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
#endif /* __ASSEMBLY __ */
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index 8fef12626090..900454aaa292 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -117,6 +117,7 @@ void spectre_bhb_patch_wa3(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void spectre_bhb_patch_clearbhb(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
+void spectre_print_disabled_mitigations(void);
#endif /* __ASSEMBLY__ */
#endif /* __ASM_SPECTRE_H */
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 7aca29e1d30b..f1cb2447afc9 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -197,8 +197,6 @@ out:
*/
void __init acpi_boot_table_init(void)
{
- int ret;
-
/*
* Enable ACPI instead of device tree unless
* - ACPI has been disabled explicitly (acpi=off), or
@@ -252,12 +250,8 @@ done:
* behaviour, use acpi=nospcr to disable console in ACPI SPCR
* table as default serial console.
*/
- ret = acpi_parse_spcr(earlycon_acpi_spcr_enable,
+ acpi_parse_spcr(earlycon_acpi_spcr_enable,
!param_acpi_nospcr);
- if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE))
- pr_info("Use ACPI SPCR as default console: No\n");
- else
- pr_info("Use ACPI SPCR as default console: Yes\n");
if (IS_ENABLED(CONFIG_ACPI_BGRT))
acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
@@ -357,16 +351,6 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
* as long as we take care not to create a writable
* mapping for executable code.
*/
- fallthrough;
-
- case EFI_ACPI_MEMORY_NVS:
- /*
- * ACPI NVS marks an area reserved for use by the
- * firmware, even after exiting the boot service.
- * This may be used by the firmware for sharing dynamic
- * tables/data (e.g., ACPI CCEL) with the OS. Map it
- * as read-only.
- */
prot = PAGE_KERNEL_RO;
break;
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 8ff6610af496..f5ec7e7c1d3f 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end)
} while (cur += d_size, cur < end);
}
-static void __apply_alternatives(const struct alt_region *region,
- bool is_module,
- unsigned long *cpucap_mask)
+static int __apply_alternatives(const struct alt_region *region,
+ bool is_module,
+ unsigned long *cpucap_mask)
{
struct alt_instr *alt;
__le32 *origptr, *updptr;
@@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region,
updptr = is_module ? origptr : lm_alias(origptr);
nr_inst = alt->orig_len / AARCH64_INSN_SIZE;
- if (ALT_HAS_CB(alt))
+ if (ALT_HAS_CB(alt)) {
alt_cb = ALT_REPL_PTR(alt);
- else
+ if (is_module && !core_kernel_text((unsigned long)alt_cb))
+ return -ENOEXEC;
+ } else {
alt_cb = patch_alternative;
+ }
alt_cb(alt, origptr, updptr, nr_inst);
@@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region,
bitmap_and(applied_alternatives, applied_alternatives,
system_cpucaps, ARM64_NCAPS);
}
+
+ return 0;
}
static void __init apply_alternatives_vdso(void)
@@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void)
}
#ifdef CONFIG_MODULES
-void apply_alternatives_module(void *start, size_t length)
+int apply_alternatives_module(void *start, size_t length)
{
struct alt_region region = {
.begin = start,
@@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length)
bitmap_fill(all_capabilities, ARM64_NCAPS);
- __apply_alternatives(&region, true, &all_capabilities[0]);
+ return __apply_alternatives(&region, true, &all_capabilities[0]);
}
#endif
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5ed401ff79e3..e25b0f84a22d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -95,6 +95,7 @@
#include <asm/vectors.h>
#include <asm/virt.h>
+#include <asm/spectre.h>
/* Kernel representation of AT_HWCAP and AT_HWCAP2 */
static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;
@@ -3875,6 +3876,11 @@ static void __init setup_system_capabilities(void)
*/
if (system_uses_ttbr0_pan())
pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
+
+ /*
+ * Report Spectre mitigations status.
+ */
+ spectre_print_disabled_mitigations();
}
void __init setup_system_features(void)
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index d6d443c4a01a..24adb581af0e 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -489,16 +489,29 @@ int module_finalize(const Elf_Ehdr *hdr,
int ret;
s = find_section(hdr, sechdrs, ".altinstructions");
- if (s)
- apply_alternatives_module((void *)s->sh_addr, s->sh_size);
+ if (s) {
+ ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size);
+ if (ret < 0) {
+ pr_err("module %s: error occurred when applying alternatives\n", me->name);
+ return ret;
+ }
+ }
if (scs_is_dynamic()) {
s = find_section(hdr, sechdrs, ".init.eh_frame");
if (s) {
- ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size);
- if (ret)
+ /*
+ * Because we can reject modules that are malformed
+ * so SCS patching fails, skip dry run and try to patch
+ * it in place. If patching fails, the module would not
+ * be loaded anyway.
+ */
+ ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true);
+ if (ret) {
pr_err("module %s: error occurred during dynamic SCS patching (%d)\n",
me->name, ret);
+ return -ENOEXEC;
+ }
}
}
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 43f7a2f39403..32148bf09c1d 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -476,7 +476,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
folio = page_folio(page);
if (folio_test_hugetlb(folio))
- WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio));
+ WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) &&
+ !is_huge_zero_folio(folio));
else
WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page));
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index e8ddbde31a83..659297f87cfa 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -104,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
if (enable_scs) {
scs_patch(__eh_frame_start + va_offset,
- __eh_frame_end - __eh_frame_start);
+ __eh_frame_end - __eh_frame_start, false);
asm("ic ialluis");
dynamic_scs_is_enabled = true;
diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c
index 55d0cd64ef71..bbe7d30ed12b 100644
--- a/arch/arm64/kernel/pi/patch-scs.c
+++ b/arch/arm64/kernel/pi/patch-scs.c
@@ -225,7 +225,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame,
return 0;
}
-int scs_patch(const u8 eh_frame[], int size)
+int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run)
{
int code_alignment_factor = 1;
bool fde_use_sdata8 = false;
@@ -277,11 +277,13 @@ int scs_patch(const u8 eh_frame[], int size)
}
} else {
ret = scs_handle_fde_frame(frame, code_alignment_factor,
- fde_use_sdata8, true);
+ fde_use_sdata8, !skip_dry_run);
if (ret)
return ret;
- scs_handle_fde_frame(frame, code_alignment_factor,
- fde_use_sdata8, false);
+
+ if (!skip_dry_run)
+ scs_handle_fde_frame(frame, code_alignment_factor,
+ fde_use_sdata8, false);
}
p += sizeof(frame->size) + frame->size;
diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h
index 08ef9f80456b..aec3172d4003 100644
--- a/arch/arm64/kernel/pi/pi.h
+++ b/arch/arm64/kernel/pi/pi.h
@@ -27,7 +27,7 @@ extern pgd_t init_pg_dir[], init_pg_end[];
void init_feature_override(u64 boot_status, const void *fdt, int chosen);
u64 kaslr_early_init(void *fdt, int chosen);
void relocate_kernel(u64 offset);
-int scs_patch(const u8 eh_frame[], int size);
+int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 8ab6104a4883..43a0361a8bf0 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -49,7 +49,10 @@ void *alloc_insn_page(void)
addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!addr)
return NULL;
- set_memory_rox((unsigned long)addr, 1);
+ if (set_memory_rox((unsigned long)addr, 1)) {
+ execmem_free(addr);
+ return NULL;
+ }
return addr;
}
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index f9a32dfde006..80a580e019c5 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param);
static bool spectre_v2_mitigations_off(void)
{
- bool ret = __nospectre_v2 || cpu_mitigations_off();
-
- if (ret)
- pr_info_once("spectre-v2 mitigation disabled by command line option\n");
-
- return ret;
+ return __nospectre_v2 || cpu_mitigations_off();
}
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
@@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param);
*/
static bool spectre_v4_mitigations_off(void)
{
- bool ret = cpu_mitigations_off() ||
- __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
-
- if (ret)
- pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
-
- return ret;
+ return cpu_mitigations_off() ||
+ __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
}
/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
@@ -1043,9 +1033,7 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
/* No point mitigating Spectre-BHB alone. */
} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
- pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
- } else if (cpu_mitigations_off() || __nospectre_bhb) {
- pr_info_once("spectre-bhb mitigation disabled by command line option\n");
+ /* Do nothing */
} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
state = SPECTRE_MITIGATED;
set_bit(BHB_HW, &system_bhb_mitigations);
@@ -1199,3 +1187,18 @@ void unpriv_ebpf_notify(int new_state)
pr_err("WARNING: %s", EBPF_WARN);
}
#endif
+
+void spectre_print_disabled_mitigations(void)
+{
+ /* Keep a single copy of the common message suffix to avoid duplication. */
+ const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n";
+
+ if (spectre_v2_mitigations_off())
+ pr_info("spectre-v2 %s", spectre_disabled_suffix);
+
+ if (spectre_v4_mitigations_off())
+ pr_info("spectre-v4 %s", spectre_disabled_suffix);
+
+ if (__nospectre_bhb || cpu_mitigations_off())
+ pr_info("spectre-bhb %s", spectre_disabled_suffix);
+}
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index ffa3536581f6..9d0efed91414 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -63,7 +63,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
$(filter -Werror,$(KBUILD_CPPFLAGS)) \
-Werror-implicit-function-declaration \
-Wno-format-security \
- -std=gnu11
+ -std=gnu11 -fms-extensions
VDSO_CFLAGS += -O2
# Some useful compiler-dependent flags from top-level Makefile
VDSO_CFLAGS += $(call cc32-option,-Wno-pointer-sign)
@@ -71,6 +71,7 @@ VDSO_CFLAGS += -fno-strict-overflow
VDSO_CFLAGS += $(call cc32-option,-Werror=strict-prototypes)
VDSO_CFLAGS += -Werror=date-time
VDSO_CFLAGS += $(call cc32-option,-Werror=incompatible-pointer-types)
+VDSO_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
# Compile as THUMB2 or ARM. Unwinding via frame-pointers in THUMB2 is
# unreliable.
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..052bf0d4d0b0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -624,6 +624,7 @@ nommu:
kvm_timer_vcpu_load(vcpu);
kvm_vgic_load(vcpu);
kvm_vcpu_load_debug(vcpu);
+ kvm_vcpu_load_fgt(vcpu);
if (has_vhe())
kvm_vcpu_load_vhe(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
@@ -642,7 +643,6 @@ nommu:
vcpu->arch.hcr_el2 |= HCR_TWI;
vcpu_set_pauth_traps(vcpu);
- kvm_vcpu_load_fgt(vcpu);
if (is_protected_kvm_enabled()) {
kvm_call_hyp_nvhe(__pkvm_vcpu_load,
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 4e16f9b96f63..58b7d0c477d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -479,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id,
struct ffa_mem_region_attributes *ep_mem_access;
struct ffa_composite_mem_region *reg;
struct ffa_mem_region *buf;
- u32 offset, nr_ranges;
+ u32 offset, nr_ranges, checked_offset;
int ret = 0;
if (addr_mbz || npages_mbz || fraglen > len ||
@@ -516,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
- if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < checked_offset) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index ddc8beb55eee..49db32f3ddf7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -367,6 +367,19 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+/*
+ * Ensure the PFN range is contained within PA-range.
+ *
+ * This check is also robust to overflows and is therefore a requirement before
+ * using a pfn/nr_pages pair from an untrusted source.
+ */
+static bool pfn_range_is_valid(u64 pfn, u64 nr_pages)
+{
+ u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT);
+
+ return pfn < limit && ((limit - pfn) >= nr_pages);
+}
+
struct kvm_mem_range {
u64 start;
u64 end;
@@ -776,6 +789,9 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
void *virt = __hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -804,6 +820,9 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
u64 virt = (u64)__hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -887,6 +906,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (!ret)
@@ -902,6 +924,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (!ret)
@@ -945,6 +970,9 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
if (ret)
return ret;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e67eb39ddc11..ec3fbe0b8d52 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2595,19 +2595,23 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
.val = 0, \
}
-/* sys_reg_desc initialiser for known cpufeature ID registers */
-#define AA32_ID_SANITISED(name) { \
- ID_DESC(name), \
- .visibility = aa32_id_visibility, \
- .val = 0, \
-}
-
/* sys_reg_desc initialiser for writable ID registers */
#define ID_WRITABLE(name, mask) { \
ID_DESC(name), \
.val = mask, \
}
+/*
+ * 32bit ID regs are fully writable when the guest is 32bit
+ * capable. Nothing in the KVM code should rely on 32bit features
+ * anyway, only 64bit, so let the VMM do its worse.
+ */
+#define AA32_ID_WRITABLE(name) { \
+ ID_DESC(name), \
+ .visibility = aa32_id_visibility, \
+ .val = GENMASK(31, 0), \
+}
+
/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */
#define ID_FILTERED(sysreg, name, mask) { \
ID_DESC(sysreg), \
@@ -3128,40 +3132,39 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
- AA32_ID_SANITISED(ID_PFR0_EL1),
- AA32_ID_SANITISED(ID_PFR1_EL1),
+ AA32_ID_WRITABLE(ID_PFR0_EL1),
+ AA32_ID_WRITABLE(ID_PFR1_EL1),
{ SYS_DESC(SYS_ID_DFR0_EL1),
.access = access_id_reg,
.get_user = get_id_reg,
.set_user = set_id_dfr0_el1,
.visibility = aa32_id_visibility,
.reset = read_sanitised_id_dfr0_el1,
- .val = ID_DFR0_EL1_PerfMon_MASK |
- ID_DFR0_EL1_CopDbg_MASK, },
+ .val = GENMASK(31, 0) },
ID_HIDDEN(ID_AFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR2_EL1),
- AA32_ID_SANITISED(ID_MMFR3_EL1),
+ AA32_ID_WRITABLE(ID_MMFR0_EL1),
+ AA32_ID_WRITABLE(ID_MMFR1_EL1),
+ AA32_ID_WRITABLE(ID_MMFR2_EL1),
+ AA32_ID_WRITABLE(ID_MMFR3_EL1),
/* CRm=2 */
- AA32_ID_SANITISED(ID_ISAR0_EL1),
- AA32_ID_SANITISED(ID_ISAR1_EL1),
- AA32_ID_SANITISED(ID_ISAR2_EL1),
- AA32_ID_SANITISED(ID_ISAR3_EL1),
- AA32_ID_SANITISED(ID_ISAR4_EL1),
- AA32_ID_SANITISED(ID_ISAR5_EL1),
- AA32_ID_SANITISED(ID_MMFR4_EL1),
- AA32_ID_SANITISED(ID_ISAR6_EL1),
+ AA32_ID_WRITABLE(ID_ISAR0_EL1),
+ AA32_ID_WRITABLE(ID_ISAR1_EL1),
+ AA32_ID_WRITABLE(ID_ISAR2_EL1),
+ AA32_ID_WRITABLE(ID_ISAR3_EL1),
+ AA32_ID_WRITABLE(ID_ISAR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR6_EL1),
/* CRm=3 */
- AA32_ID_SANITISED(MVFR0_EL1),
- AA32_ID_SANITISED(MVFR1_EL1),
- AA32_ID_SANITISED(MVFR2_EL1),
+ AA32_ID_WRITABLE(MVFR0_EL1),
+ AA32_ID_WRITABLE(MVFR1_EL1),
+ AA32_ID_WRITABLE(MVFR2_EL1),
ID_UNALLOCATED(3,3),
- AA32_ID_SANITISED(ID_PFR2_EL1),
+ AA32_ID_WRITABLE(ID_PFR2_EL1),
ID_HIDDEN(ID_DFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR5_EL1),
ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
@@ -5606,11 +5609,17 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
guard(mutex)(&kvm->arch.config_lock);
- if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
- irqchip_in_kernel(kvm) &&
- kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
+ /*
+ * This hacks into the ID registers, so only perform it when the
+ * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream.
+ */
+ if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) {
+ u64 val;
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
}
if (vcpu_has_nv(vcpu)) {
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 4c1209261b65..bb92853d1fd3 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -64,29 +64,37 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
static int iter_mark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
int nr_lpis = 0;
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+
xa_for_each(&dist->lpi_xa, intid, irq) {
if (!vgic_try_get_irq_ref(irq))
continue;
- xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
nr_lpis++;
}
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
return nr_lpis;
}
static void iter_unmark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
- xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+ __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ /* vgic_put_irq() expects to be called outside of the xa_lock */
vgic_put_irq(kvm, irq);
}
}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1796b1a22a72..da62edbc1205 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- xa_init(&dist->lpi_xa);
+ xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
@@ -71,6 +71,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
+ u64 aa64pfr0, pfr1;
unsigned long i;
int ret;
@@ -161,10 +162,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- else
+ } else {
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ }
+
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index ce3e3ed3f29f..3f1c4b10fed9 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -78,6 +78,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
+ unsigned long flags;
int ret;
/* In this case there is no put, since we keep the reference. */
@@ -88,7 +89,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
if (ret) {
kfree(irq);
return ERR_PTR(ret);
@@ -103,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
irq->target_vcpu = vcpu;
irq->group = 1;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
@@ -114,21 +115,18 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
-
- goto out_unlock;
+ } else {
+ ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
}
- ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
if (ret) {
xa_release(&dist->lpi_xa, intid);
kfree(irq);
- }
-
-out_unlock:
- xa_unlock(&dist->lpi_xa);
- if (ret)
return ERR_PTR(ret);
+ }
/*
* We "cache" the configuration table entries in our struct vgic_irq's.
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6fbb4b099855..2f75ef14d339 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -301,7 +301,8 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
return;
/* Hide GICv3 sysreg if necessary */
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 ||
+ !irqchip_in_kernel(vcpu->kvm)) {
vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
ICH_HCR_EL2_TC);
return;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 6dd5a10081e2..8d20c53faef0 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* kvm->arch.config_lock (mutex)
* its->cmd_lock (mutex)
* its->its_lock (mutex)
- * vgic_dist->lpi_xa.xa_lock
+ * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
* vgic_cpu->ap_list_lock must be taken with IRQs disabled
* vgic_irq->irq_lock must be taken with IRQs disabled
*
@@ -141,32 +141,39 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long flags;
- if (irq->intid >= VGIC_MIN_LPI)
- might_lock(&dist->lpi_xa.xa_lock);
+ /*
+ * Normally the lock is only taken when the refcount drops to 0.
+ * Acquire/release it early on lockdep kernels to make locking issues
+ * in rare release paths a bit more obvious.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) {
+ guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock);
+ }
if (!__vgic_put_irq(kvm, irq))
return;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
vgic_release_lpi_locked(dist, irq);
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
static void vgic_release_deleted_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned long intid;
+ unsigned long flags, intid;
struct vgic_irq *irq;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
xa_for_each(&dist->lpi_xa, intid, irq) {
if (irq->pending_release)
vgic_release_lpi_locked(dist, irq);
}
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index d816ff44faff..a193b6a5d1e6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -967,10 +967,21 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
return vma_alloc_folio(flags, 0, vma, vaddr);
}
-void tag_clear_highpage(struct page *page)
+bool tag_clear_highpages(struct page *page, int numpages)
{
- /* Newly allocated page, shouldn't have been tagged yet */
- WARN_ON_ONCE(!try_page_mte_tagging(page));
- mte_zero_clear_page_tags(page_address(page));
- set_page_mte_tagged(page);
+ /*
+ * Check if MTE is supported and fall back to clear_highpage().
+ * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
+ * post_alloc_hook() will invoke tag_clear_highpages().
+ */
+ if (!system_supports_mte())
+ return false;
+
+ /* Newly allocated pages, shouldn't have been tagged yet */
+ for (int i = 0; i < numpages; i++, page++) {
+ WARN_ON_ONCE(!try_page_mte_tagging(page));
+ mte_zero_clear_page_tags(page_address(page));
+ set_page_mte_tagged(page);
+ }
+ return true;
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b8d37eb037fc..2ba01dc8ef82 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -708,6 +708,30 @@ out:
return ret;
}
+static inline bool force_pte_mapping(void)
+{
+ const bool bbml2 = system_capabilities_finalized() ?
+ system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
+
+ if (debug_pagealloc_enabled())
+ return true;
+ if (bbml2)
+ return false;
+ return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
+}
+
+static inline bool split_leaf_mapping_possible(void)
+{
+ /*
+ * !BBML2_NOABORT systems should never run into scenarios where we would
+ * have to split. So exit early and let calling code detect it and raise
+ * a warning.
+ */
+ if (!system_supports_bbml2_noabort())
+ return false;
+ return !force_pte_mapping();
+}
+
static DEFINE_MUTEX(pgtable_split_lock);
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
@@ -715,12 +739,11 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
int ret;
/*
- * !BBML2_NOABORT systems should not be trying to change permissions on
- * anything that is not pte-mapped in the first place. Just return early
- * and let the permission change code raise a warning if not already
- * pte-mapped.
+ * Exit early if the region is within a pte-mapped area or if we can't
+ * split. For the latter case, the permission change code will raise a
+ * warning if not already pte-mapped.
*/
- if (!system_supports_bbml2_noabort())
+ if (!split_leaf_mapping_possible() || is_kfence_address((void *)start))
return 0;
/*
@@ -758,30 +781,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
return ret;
}
-static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
- unsigned long next,
- struct mm_walk *walk)
+static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
+ gfp_t gfp = *(gfp_t *)walk->private;
pud_t pud = pudp_get(pudp);
int ret = 0;
if (pud_leaf(pud))
- ret = split_pud(pudp, pud, GFP_ATOMIC, false);
+ ret = split_pud(pudp, pud, gfp, false);
return ret;
}
-static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
- unsigned long next,
- struct mm_walk *walk)
+static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
+ gfp_t gfp = *(gfp_t *)walk->private;
pmd_t pmd = pmdp_get(pmdp);
int ret = 0;
if (pmd_leaf(pmd)) {
if (pmd_cont(pmd))
split_contpmd(pmdp);
- ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
+ ret = split_pmd(pmdp, pmd, gfp, false);
/*
* We have split the pmd directly to ptes so there is no need to
@@ -793,9 +816,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
return ret;
}
-static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
- unsigned long next,
- struct mm_walk *walk)
+static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
{
pte_t pte = __ptep_get(ptep);
@@ -805,12 +827,24 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops split_to_ptes_ops __initconst = {
+static const struct mm_walk_ops split_to_ptes_ops = {
.pud_entry = split_to_ptes_pud_entry,
.pmd_entry = split_to_ptes_pmd_entry,
.pte_entry = split_to_ptes_pte_entry,
};
+static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
+{
+ int ret;
+
+ arch_enter_lazy_mmu_mode();
+ ret = walk_kernel_page_table_range_lockless(start, end,
+ &split_to_ptes_ops, NULL, &gfp);
+ arch_leave_lazy_mmu_mode();
+
+ return ret;
+}
+
static bool linear_map_requires_bbml2 __initdata;
u32 idmap_kpti_bbml2_flag;
@@ -847,11 +881,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
* PTE. The kernel alias remains static throughout runtime so
* can continue to be safely mapped with large mappings.
*/
- ret = walk_kernel_page_table_range_lockless(lstart, kstart,
- &split_to_ptes_ops, NULL, NULL);
+ ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
if (!ret)
- ret = walk_kernel_page_table_range_lockless(kend, lend,
- &split_to_ptes_ops, NULL, NULL);
+ ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
if (ret)
panic("Failed to split linear map\n");
flush_tlb_kernel_range(lstart, lend);
@@ -1002,6 +1034,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
__kfence_pool = phys_to_virt(kfence_pool);
}
+
+bool arch_kfence_init_pool(void)
+{
+ unsigned long start = (unsigned long)__kfence_pool;
+ unsigned long end = start + KFENCE_POOL_SIZE;
+ int ret;
+
+ /* Exit early if we know the linear map is already pte-mapped. */
+ if (!split_leaf_mapping_possible())
+ return true;
+
+ /* Kfence pool is already pte-mapped for the early init case. */
+ if (kfence_early_init)
+ return true;
+
+ mutex_lock(&pgtable_split_lock);
+ ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
+ mutex_unlock(&pgtable_split_lock);
+
+ /*
+ * Since the system supports bbml2_noabort, tlb invalidation is not
+ * required here; the pgtable mappings have been split to pte but larger
+ * entries may safely linger in the TLB.
+ */
+
+ return !ret;
+}
#else /* CONFIG_KFENCE */
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
@@ -1009,16 +1068,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
-static inline bool force_pte_mapping(void)
-{
- bool bbml2 = system_capabilities_finalized() ?
- system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
-
- return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
- is_realm_world())) ||
- debug_pagealloc_enabled();
-}
-
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 8d9088bc577d..8cdfe5d4dac9 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -481,3 +481,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/loongarch/include/asm/cpu-features.h b/arch/loongarch/include/asm/cpu-features.h
index fc83bb32f9f0..bd5f0457ad21 100644
--- a/arch/loongarch/include/asm/cpu-features.h
+++ b/arch/loongarch/include/asm/cpu-features.h
@@ -67,6 +67,8 @@
#define cpu_has_hypervisor cpu_opt(LOONGARCH_CPU_HYPERVISOR)
#define cpu_has_ptw cpu_opt(LOONGARCH_CPU_PTW)
#define cpu_has_lspw cpu_opt(LOONGARCH_CPU_LSPW)
+#define cpu_has_msgint cpu_opt(LOONGARCH_CPU_MSGINT)
#define cpu_has_avecint cpu_opt(LOONGARCH_CPU_AVECINT)
+#define cpu_has_redirectint cpu_opt(LOONGARCH_CPU_REDIRECTINT)
#endif /* __ASM_CPU_FEATURES_H */
diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h
index dfb982fe8701..f3efb00b6141 100644
--- a/arch/loongarch/include/asm/cpu.h
+++ b/arch/loongarch/include/asm/cpu.h
@@ -55,6 +55,27 @@ enum cpu_type_enum {
CPU_LAST
};
+static inline char *id_to_core_name(unsigned int id)
+{
+ if ((id & PRID_COMP_MASK) != PRID_COMP_LOONGSON)
+ return "Unknown";
+
+ switch (id & PRID_SERIES_MASK) {
+ case PRID_SERIES_LA132:
+ return "LA132";
+ case PRID_SERIES_LA264:
+ return "LA264";
+ case PRID_SERIES_LA364:
+ return "LA364";
+ case PRID_SERIES_LA464:
+ return "LA464";
+ case PRID_SERIES_LA664:
+ return "LA664";
+ default:
+ return "Unknown";
+ }
+}
+
#endif /* !__ASSEMBLER__ */
/*
@@ -101,7 +122,9 @@ enum cpu_type_enum {
#define CPU_FEATURE_HYPERVISOR 26 /* CPU has hypervisor (running in VM) */
#define CPU_FEATURE_PTW 27 /* CPU has hardware page table walker */
#define CPU_FEATURE_LSPW 28 /* CPU has LSPW (lddir/ldpte instructions) */
-#define CPU_FEATURE_AVECINT 29 /* CPU has AVEC interrupt */
+#define CPU_FEATURE_MSGINT 29 /* CPU has MSG interrupt */
+#define CPU_FEATURE_AVECINT 30 /* CPU has AVEC interrupt */
+#define CPU_FEATURE_REDIRECTINT 31 /* CPU has interrupt remapping */
#define LOONGARCH_CPU_CPUCFG BIT_ULL(CPU_FEATURE_CPUCFG)
#define LOONGARCH_CPU_LAM BIT_ULL(CPU_FEATURE_LAM)
@@ -132,6 +155,8 @@ enum cpu_type_enum {
#define LOONGARCH_CPU_HYPERVISOR BIT_ULL(CPU_FEATURE_HYPERVISOR)
#define LOONGARCH_CPU_PTW BIT_ULL(CPU_FEATURE_PTW)
#define LOONGARCH_CPU_LSPW BIT_ULL(CPU_FEATURE_LSPW)
+#define LOONGARCH_CPU_MSGINT BIT_ULL(CPU_FEATURE_MSGINT)
#define LOONGARCH_CPU_AVECINT BIT_ULL(CPU_FEATURE_AVECINT)
+#define LOONGARCH_CPU_REDIRECTINT BIT_ULL(CPU_FEATURE_REDIRECTINT)
#endif /* _ASM_CPU_H */
diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h
index 13b2462f3d8c..5faa97a87a9e 100644
--- a/arch/loongarch/include/asm/hw_breakpoint.h
+++ b/arch/loongarch/include/asm/hw_breakpoint.h
@@ -134,13 +134,13 @@ static inline void hw_breakpoint_thread_switch(struct task_struct *next)
/* Determine number of BRP registers available. */
static inline int get_num_brps(void)
{
- return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM;
+ return csr_read32(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM;
}
/* Determine number of WRP registers available. */
static inline int get_num_wrps(void)
{
- return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM;
+ return csr_read32(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM;
}
#endif /* __KERNEL__ */
diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h
index eaff72b38dc8..0130185e0349 100644
--- a/arch/loongarch/include/asm/io.h
+++ b/arch/loongarch/include/asm/io.h
@@ -14,7 +14,7 @@
#include <asm/pgtable-bits.h>
#include <asm/string.h>
-extern void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size);
+extern void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size);
extern void __init early_iounmap(void __iomem *addr, unsigned long size);
#define early_memremap early_ioremap
@@ -25,6 +25,9 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size);
static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
pgprot_t prot)
{
+ if (offset > TO_PHYS_MASK)
+ return NULL;
+
switch (pgprot_val(prot) & _CACHE_MASK) {
case _CACHE_CC:
return (void __iomem *)(unsigned long)(CACHE_BASE + offset);
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 09dfd7eb406e..3de03cb864b2 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -128,6 +128,7 @@
#define CPUCFG6_PMNUM GENMASK(7, 4)
#define CPUCFG6_PMNUM_SHIFT 4
#define CPUCFG6_PMBITS GENMASK(13, 8)
+#define CPUCFG6_PMBITS_SHIFT 8
#define CPUCFG6_UPM BIT(14)
#define LOONGARCH_CPUCFG16 0x10
@@ -1137,6 +1138,7 @@
#define IOCSRF_FLATMODE BIT_ULL(10)
#define IOCSRF_VM BIT_ULL(11)
#define IOCSRF_AVEC BIT_ULL(15)
+#define IOCSRF_REDIRECT BIT_ULL(16)
#define LOONGARCH_IOCSR_VENDOR 0x10
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index 1c63a9d9a6d3..08dcc698ec18 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -88,7 +88,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
{
pud_t *pud;
- struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
if (!ptdesc)
return NULL;
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index bd128696e96d..03fb60432fde 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -424,6 +424,9 @@ static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
+ if (pte_val(pte) & _PAGE_DIRTY)
+ pte_val(pte) |= _PAGE_MODIFIED;
+
return __pte((pte_val(pte) & _PAGE_CHG_MASK) |
(pgprot_val(newprot) & ~_PAGE_CHG_MASK));
}
@@ -547,9 +550,11 @@ static inline struct page *pmd_page(pmd_t pmd)
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
- pmd_val(pmd) = (pmd_val(pmd) & _HPAGE_CHG_MASK) |
- (pgprot_val(newprot) & ~_HPAGE_CHG_MASK);
- return pmd;
+ if (pmd_val(pmd) & _PAGE_DIRTY)
+ pmd_val(pmd) |= _PAGE_MODIFIED;
+
+ return __pmd((pmd_val(pmd) & _HPAGE_CHG_MASK) |
+ (pgprot_val(newprot) & ~_HPAGE_CHG_MASK));
}
static inline pmd_t pmd_mkinvalid(pmd_t pmd)
diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h
index aafb3cd9e943..215e0f9e8aa3 100644
--- a/arch/loongarch/include/uapi/asm/ptrace.h
+++ b/arch/loongarch/include/uapi/asm/ptrace.h
@@ -10,10 +10,6 @@
#include <linux/types.h>
-#ifndef __KERNEL__
-#include <stdint.h>
-#endif
-
/*
* For PTRACE_{POKE,PEEK}USR. 0 - 31 are GPRs,
* 32 is syscall's original ARG0, 33 is PC, 34 is BADVADDR.
@@ -41,44 +37,44 @@ struct user_pt_regs {
} __attribute__((aligned(8)));
struct user_fp_state {
- uint64_t fpr[32];
- uint64_t fcc;
- uint32_t fcsr;
+ __u64 fpr[32];
+ __u64 fcc;
+ __u32 fcsr;
};
struct user_lsx_state {
/* 32 registers, 128 bits width per register. */
- uint64_t vregs[32*2];
+ __u64 vregs[32*2];
};
struct user_lasx_state {
/* 32 registers, 256 bits width per register. */
- uint64_t vregs[32*4];
+ __u64 vregs[32*4];
};
struct user_lbt_state {
- uint64_t scr[4];
- uint32_t eflags;
- uint32_t ftop;
+ __u64 scr[4];
+ __u32 eflags;
+ __u32 ftop;
};
struct user_watch_state {
- uint64_t dbg_info;
+ __u64 dbg_info;
struct {
- uint64_t addr;
- uint64_t mask;
- uint32_t ctrl;
- uint32_t pad;
+ __u64 addr;
+ __u64 mask;
+ __u32 ctrl;
+ __u32 pad;
} dbg_regs[8];
};
struct user_watch_state_v2 {
- uint64_t dbg_info;
+ __u64 dbg_info;
struct {
- uint64_t addr;
- uint64_t mask;
- uint32_t ctrl;
- uint32_t pad;
+ __u64 addr;
+ __u64 mask;
+ __u32 ctrl;
+ __u32 pad;
} dbg_regs[14];
};
diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c
index cbfce2872d71..a2060a24b39f 100644
--- a/arch/loongarch/kernel/cpu-probe.c
+++ b/arch/loongarch/kernel/cpu-probe.c
@@ -157,6 +157,8 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c)
c->options |= LOONGARCH_CPU_TLB;
if (config & CPUCFG1_IOCSR)
c->options |= LOONGARCH_CPU_IOCSR;
+ if (config & CPUCFG1_MSGINT)
+ c->options |= LOONGARCH_CPU_MSGINT;
if (config & CPUCFG1_UAL) {
c->options |= LOONGARCH_CPU_UAL;
elf_hwcap |= HWCAP_LOONGARCH_UAL;
@@ -275,7 +277,7 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int
uint32_t config;
uint64_t *vendor = (void *)(&cpu_full_name[VENDOR_OFFSET]);
uint64_t *cpuname = (void *)(&cpu_full_name[CPUNAME_OFFSET]);
- const char *core_name = "Unknown";
+ const char *core_name = id_to_core_name(c->processor_id);
switch (BIT(fls(c->isa_level) - 1)) {
case LOONGARCH_CPU_ISA_LA32R:
@@ -289,35 +291,23 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int
break;
}
- switch (c->processor_id & PRID_SERIES_MASK) {
- case PRID_SERIES_LA132:
- core_name = "LA132";
- break;
- case PRID_SERIES_LA264:
- core_name = "LA264";
- break;
- case PRID_SERIES_LA364:
- core_name = "LA364";
- break;
- case PRID_SERIES_LA464:
- core_name = "LA464";
- break;
- case PRID_SERIES_LA664:
- core_name = "LA664";
- break;
- }
-
pr_info("%s Processor probed (%s Core)\n", __cpu_family[cpu], core_name);
- if (!cpu_has_iocsr)
+ if (!cpu_has_iocsr) {
+ __cpu_full_name[cpu] = "Unknown";
return;
-
- if (!__cpu_full_name[cpu])
- __cpu_full_name[cpu] = cpu_full_name;
+ }
*vendor = iocsr_read64(LOONGARCH_IOCSR_VENDOR);
*cpuname = iocsr_read64(LOONGARCH_IOCSR_CPUNAME);
+ if (!__cpu_full_name[cpu]) {
+ if (((char *)vendor)[0] == 0)
+ __cpu_full_name[cpu] = "Unknown";
+ else
+ __cpu_full_name[cpu] = cpu_full_name;
+ }
+
config = iocsr_read32(LOONGARCH_IOCSR_FEATURES);
if (config & IOCSRF_CSRIPI)
c->options |= LOONGARCH_CPU_CSRIPI;
@@ -331,6 +321,8 @@ static inline void cpu_probe_loongson(struct cpuinfo_loongarch *c, unsigned int
c->options |= LOONGARCH_CPU_EIODECODE;
if (config & IOCSRF_AVEC)
c->options |= LOONGARCH_CPU_AVECINT;
+ if (config & IOCSRF_REDIRECT)
+ c->options |= LOONGARCH_CPU_REDIRECTINT;
if (config & IOCSRF_VM)
c->options |= LOONGARCH_CPU_HYPERVISOR;
}
diff --git a/arch/loongarch/kernel/kexec_efi.c b/arch/loongarch/kernel/kexec_efi.c
index 45121b914f8f..5ee78ebb1546 100644
--- a/arch/loongarch/kernel/kexec_efi.c
+++ b/arch/loongarch/kernel/kexec_efi.c
@@ -42,7 +42,7 @@ static void *efi_kexec_load(struct kimage *image,
{
int ret;
unsigned long text_offset, kernel_segment_number;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
struct kexec_segment *kernel_segment;
struct loongarch_image_header *h;
diff --git a/arch/loongarch/kernel/kexec_elf.c b/arch/loongarch/kernel/kexec_elf.c
index 97b2f049801a..1b6b64744c7f 100644
--- a/arch/loongarch/kernel/kexec_elf.c
+++ b/arch/loongarch/kernel/kexec_elf.c
@@ -59,7 +59,7 @@ static void *elf_kexec_load(struct kimage *image,
int ret;
unsigned long text_offset, kernel_segment_number;
struct elfhdr ehdr;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
struct kexec_elf_info elf_info;
struct kexec_segment *kernel_segment;
diff --git a/arch/loongarch/kernel/machine_kexec.c b/arch/loongarch/kernel/machine_kexec.c
index e4b2bbc47e62..d7fafda1d541 100644
--- a/arch/loongarch/kernel/machine_kexec.c
+++ b/arch/loongarch/kernel/machine_kexec.c
@@ -39,34 +39,12 @@ static unsigned long systable_ptr;
static unsigned long start_addr;
static unsigned long first_ind_entry;
-static void kexec_image_info(const struct kimage *kimage)
-{
- unsigned long i;
-
- pr_debug("kexec kimage info:\n");
- pr_debug("\ttype: %d\n", kimage->type);
- pr_debug("\tstart: %lx\n", kimage->start);
- pr_debug("\thead: %lx\n", kimage->head);
- pr_debug("\tnr_segments: %lu\n", kimage->nr_segments);
-
- for (i = 0; i < kimage->nr_segments; i++) {
- pr_debug("\t segment[%lu]: %016lx - %016lx", i,
- kimage->segment[i].mem,
- kimage->segment[i].mem + kimage->segment[i].memsz);
- pr_debug("\t\t0x%lx bytes, %lu pages\n",
- (unsigned long)kimage->segment[i].memsz,
- (unsigned long)kimage->segment[i].memsz / PAGE_SIZE);
- }
-}
-
int machine_kexec_prepare(struct kimage *kimage)
{
int i;
char *bootloader = "kexec";
void *cmdline_ptr = (void *)KEXEC_CMDLINE_ADDR;
- kexec_image_info(kimage);
-
kimage->arch.efi_boot = fw_arg0;
kimage->arch.systable_ptr = fw_arg2;
@@ -259,6 +237,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_SMP
crash_smp_send_stop();
#endif
+ machine_kexec_mask_interrupts();
cpumask_set_cpu(crashing_cpu, &cpus_in_crash);
pr_info("Starting crashdump kernel...\n");
@@ -296,6 +275,7 @@ void machine_kexec(struct kimage *image)
/* We do not want to be bothered. */
local_irq_disable();
+ machine_kexec_mask_interrupts();
pr_notice("EFI boot flag: 0x%lx\n", efi_boot);
pr_notice("Command line addr: 0x%lx\n", cmdline_ptr);
diff --git a/arch/loongarch/kernel/machine_kexec_file.c b/arch/loongarch/kernel/machine_kexec_file.c
index dda236b51a88..fb57026f5f25 100644
--- a/arch/loongarch/kernel/machine_kexec_file.c
+++ b/arch/loongarch/kernel/machine_kexec_file.c
@@ -143,7 +143,7 @@ int load_other_segments(struct kimage *image,
unsigned long initrd_load_addr = 0;
unsigned long orig_segments = image->nr_segments;
char *modified_cmdline = NULL;
- struct kexec_buf kbuf;
+ struct kexec_buf kbuf = {};
kbuf.image = image;
/* Don't allocate anything below the kernel */
diff --git a/arch/loongarch/kernel/mem.c b/arch/loongarch/kernel/mem.c
index aed901c57fb4..8ab1ffedc52c 100644
--- a/arch/loongarch/kernel/mem.c
+++ b/arch/loongarch/kernel/mem.c
@@ -13,7 +13,7 @@
void __init memblock_init(void)
{
u32 mem_type;
- u64 mem_start, mem_end, mem_size;
+ u64 mem_start, mem_size;
efi_memory_desc_t *md;
/* Parse memory information */
@@ -21,7 +21,6 @@ void __init memblock_init(void)
mem_type = md->type;
mem_start = md->phys_addr;
mem_size = md->num_pages << EFI_PAGE_SHIFT;
- mem_end = mem_start + mem_size;
switch (mem_type) {
case EFI_LOADER_CODE:
@@ -31,8 +30,6 @@ void __init memblock_init(void)
case EFI_PERSISTENT_MEMORY:
case EFI_CONVENTIONAL_MEMORY:
memblock_add(mem_start, mem_size);
- if (max_low_pfn < (mem_end >> PAGE_SHIFT))
- max_low_pfn = mem_end >> PAGE_SHIFT;
break;
case EFI_PAL_CODE:
case EFI_UNUSABLE_MEMORY:
@@ -49,6 +46,8 @@ void __init memblock_init(void)
}
}
+ max_pfn = PFN_DOWN(memblock_end_of_DRAM());
+ max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn);
memblock_set_current_limit(PFN_PHYS(max_low_pfn));
/* Reserve the first 2MB */
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index d6e73e8f9c0b..8b89898e20df 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -158,35 +158,9 @@ static void __init node_mem_init(unsigned int node)
#ifdef CONFIG_ACPI_NUMA
-/*
- * add_numamem_region
- *
- * Add a uasable memory region described by BIOS. The
- * routine gets each intersection between BIOS's region
- * and node's region, and adds them into node's memblock
- * pool.
- *
- */
-static void __init add_numamem_region(u64 start, u64 end, u32 type)
-{
- u32 node = pa_to_nid(start);
- u64 size = end - start;
- static unsigned long num_physpages;
-
- if (start >= end) {
- pr_debug("Invalid region: %016llx-%016llx\n", start, end);
- return;
- }
-
- num_physpages += (size >> PAGE_SHIFT);
- pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n",
- node, type, start, size);
- pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
- start >> PAGE_SHIFT, end >> PAGE_SHIFT, num_physpages);
- memblock_set_node(start, size, &memblock.memory, node);
-}
+static unsigned long num_physpages;
-static void __init init_node_memblock(void)
+static void __init info_node_memblock(void)
{
u32 mem_type;
u64 mem_end, mem_start, mem_size;
@@ -206,12 +180,20 @@ static void __init init_node_memblock(void)
case EFI_BOOT_SERVICES_DATA:
case EFI_PERSISTENT_MEMORY:
case EFI_CONVENTIONAL_MEMORY:
- add_numamem_region(mem_start, mem_end, mem_type);
+ num_physpages += (mem_size >> PAGE_SHIFT);
+ pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n",
+ (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size);
+ pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
+ mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages);
break;
case EFI_PAL_CODE:
case EFI_UNUSABLE_MEMORY:
case EFI_ACPI_RECLAIM_MEMORY:
- add_numamem_region(mem_start, mem_end, mem_type);
+ num_physpages += (mem_size >> PAGE_SHIFT);
+ pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n",
+ (u32)pa_to_nid(mem_start), mem_type, mem_start, mem_size);
+ pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
+ mem_start >> PAGE_SHIFT, mem_end >> PAGE_SHIFT, num_physpages);
fallthrough;
case EFI_RESERVED_TYPE:
case EFI_RUNTIME_SERVICES_CODE:
@@ -249,22 +231,16 @@ int __init init_numa_memory(void)
for (i = 0; i < NR_CPUS; i++)
set_cpuid_to_node(i, NUMA_NO_NODE);
- numa_reset_distance();
- nodes_clear(numa_nodes_parsed);
- nodes_clear(node_possible_map);
- nodes_clear(node_online_map);
- WARN_ON(memblock_clear_hotplug(0, PHYS_ADDR_MAX));
-
/* Parse SRAT and SLIT if provided by firmware. */
- ret = acpi_disabled ? fake_numa_init() : acpi_numa_init();
+ if (!acpi_disabled)
+ ret = numa_memblks_init(acpi_numa_init, false);
+ else
+ ret = numa_memblks_init(fake_numa_init, false);
+
if (ret < 0)
return ret;
- node_possible_map = numa_nodes_parsed;
- if (WARN_ON(nodes_empty(node_possible_map)))
- return -EINVAL;
-
- init_node_memblock();
+ info_node_memblock();
if (!memblock_validate_numa_coverage(SZ_1M))
return -EINVAL;
@@ -272,7 +248,8 @@ int __init init_numa_memory(void)
node_mem_init(node);
node_set_online(node);
}
- max_low_pfn = PHYS_PFN(memblock_end_of_DRAM());
+ max_pfn = PFN_DOWN(memblock_end_of_DRAM());
+ max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn);
setup_nr_node_ids();
loongson_sysconf.nr_nodes = nr_node_ids;
@@ -283,26 +260,6 @@ int __init init_numa_memory(void)
#endif
-void __init paging_init(void)
-{
- unsigned int node;
- unsigned long zones_size[MAX_NR_ZONES] = {0, };
-
- for_each_online_node(node) {
- unsigned long start_pfn, end_pfn;
-
- get_pfn_range_for_nid(node, &start_pfn, &end_pfn);
-
- if (end_pfn > max_low_pfn)
- max_low_pfn = end_pfn;
- }
-#ifdef CONFIG_ZONE_DMA32
- zones_size[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
- zones_size[ZONE_NORMAL] = max_low_pfn;
- free_area_init(zones_size);
-}
-
int pcibus_to_node(struct pci_bus *bus)
{
return dev_to_node(&bus->dev);
diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c
index 8ad098703488..9d257c8519c9 100644
--- a/arch/loongarch/kernel/perf_event.c
+++ b/arch/loongarch/kernel/perf_event.c
@@ -845,13 +845,14 @@ static const struct loongarch_perf_event *loongarch_pmu_map_raw_event(u64 config
static int __init init_hw_perf_events(void)
{
- int counters;
+ int bits, counters;
if (!cpu_has_pmp)
return -ENODEV;
pr_info("Performance counters: ");
- counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> 4) + 1;
+ bits = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMBITS) >> CPUCFG6_PMBITS_SHIFT) + 1;
+ counters = ((read_cpucfg(LOONGARCH_CPUCFG6) & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT) + 1;
loongarch_pmu.num_counters = counters;
loongarch_pmu.max_period = (1ULL << 63) - 1;
@@ -867,7 +868,7 @@ static int __init init_hw_perf_events(void)
on_each_cpu(reset_counters, NULL, 1);
pr_cont("%s PMU enabled, %d %d-bit counters available to each CPU.\n",
- loongarch_pmu.name, counters, 64);
+ loongarch_pmu.name, counters, bits);
perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
diff --git a/arch/loongarch/kernel/proc.c b/arch/loongarch/kernel/proc.c
index cea30768ae92..63d2b7e7e844 100644
--- a/arch/loongarch/kernel/proc.c
+++ b/arch/loongarch/kernel/proc.c
@@ -17,6 +17,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
{
unsigned long n = (unsigned long) v - 1;
unsigned int isa = cpu_data[n].isa_level;
+ unsigned int prid = cpu_data[n].processor_id;
unsigned int version = cpu_data[n].processor_id & 0xff;
unsigned int fp_version = cpu_data[n].fpu_vers;
@@ -37,6 +38,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "global_id\t\t: %d\n", cpu_data[n].global_id);
seq_printf(m, "CPU Family\t\t: %s\n", __cpu_family[n]);
seq_printf(m, "Model Name\t\t: %s\n", __cpu_full_name[n]);
+ seq_printf(m, "PRID\t\t\t: %s (%08x)\n", id_to_core_name(prid), prid);
seq_printf(m, "CPU Revision\t\t: 0x%02x\n", version);
seq_printf(m, "FPU Revision\t\t: 0x%02x\n", fp_version);
seq_printf(m, "CPU MHz\t\t\t: %llu.%02llu\n",
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 69c17d162fff..25a87378e48e 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -294,8 +294,6 @@ static void __init fdt_setup(void)
early_init_dt_scan(fdt_pointer, __pa(fdt_pointer));
early_init_fdt_reserve_self();
-
- max_low_pfn = PFN_PHYS(memblock_end_of_DRAM());
#endif
}
@@ -390,7 +388,8 @@ static void __init check_kernel_sections_mem(void)
static void __init arch_mem_init(char **cmdline_p)
{
/* Recalculate max_low_pfn for "mem=xxx" */
- max_pfn = max_low_pfn = PHYS_PFN(memblock_end_of_DRAM());
+ max_pfn = PFN_DOWN(memblock_end_of_DRAM());
+ max_low_pfn = min(PFN_DOWN(HIGHMEM_START), max_pfn);
if (usermem)
pr_info("User-defined physical RAM map overwrite\n");
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index 3d9be6ca7ec5..da5926fead4a 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -1131,8 +1131,8 @@ static void configure_exception_vector(void)
tlbrentry = (unsigned long)exception_handlers + 80*VECSIZE;
csr_write64(eentry, LOONGARCH_CSR_EENTRY);
- csr_write64(eentry, LOONGARCH_CSR_MERRENTRY);
- csr_write64(tlbrentry, LOONGARCH_CSR_TLBRENTRY);
+ csr_write64(__pa(eentry), LOONGARCH_CSR_MERRENTRY);
+ csr_write64(__pa(tlbrentry), LOONGARCH_CSR_TLBRENTRY);
}
void per_cpu_trap_init(int cpu)
diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c
index c32333695381..a1cc116b4dac 100644
--- a/arch/loongarch/kvm/intc/eiointc.c
+++ b/arch/loongarch/kvm/intc/eiointc.c
@@ -439,7 +439,7 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev,
spin_lock_irqsave(&s->lock, flags);
switch (type) {
case KVM_DEV_LOONGARCH_EXTIOI_CTRL_INIT_NUM_CPU:
- if (val >= EIOINTC_ROUTE_MAX_VCPUS)
+ if (val > EIOINTC_ROUTE_MAX_VCPUS)
ret = -EINVAL;
else
s->num_cpu = val;
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index 7c8143e79c12..a7fa458e3360 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -857,7 +857,7 @@ retry:
if (writeable) {
prot_bits = kvm_pte_mkwriteable(prot_bits);
- if (write)
+ if (write || !kvm_slot_dirty_track_enabled(memslot))
prot_bits = kvm_pte_mkdirty(prot_bits);
}
diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c
index 32dc213374be..29c2aaba63c3 100644
--- a/arch/loongarch/kvm/timer.c
+++ b/arch/loongarch/kvm/timer.c
@@ -4,6 +4,7 @@
*/
#include <linux/kvm_host.h>
+#include <asm/delay.h>
#include <asm/kvm_csr.h>
#include <asm/kvm_vcpu.h>
@@ -95,6 +96,7 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu)
* and set CSR TVAL with -1
*/
write_gcsr_timertick(0);
+ __delay(2); /* Wait cycles until timer interrupt injected */
/*
* Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 30e3b089a596..1245a6b35896 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -132,6 +132,9 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu)
* Clear KVM_LARCH_PMU if the guest is not using PMU CSRs when
* exiting the guest, so that the next time trap into the guest.
* We don't need to deal with PMU CSRs contexts.
+ *
+ * Otherwise set the request bit KVM_REQ_PMU to restore guest PMU
+ * before entering guest VM
*/
val = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL0);
val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL1);
@@ -139,16 +142,12 @@ static void kvm_lose_pmu(struct kvm_vcpu *vcpu)
val |= kvm_read_sw_gcsr(csr, LOONGARCH_CSR_PERFCTRL3);
if (!(val & KVM_PMU_EVENT_ENABLED))
vcpu->arch.aux_inuse &= ~KVM_LARCH_PMU;
+ else
+ kvm_make_request(KVM_REQ_PMU, vcpu);
kvm_restore_host_pmu(vcpu);
}
-static void kvm_restore_pmu(struct kvm_vcpu *vcpu)
-{
- if ((vcpu->arch.aux_inuse & KVM_LARCH_PMU))
- kvm_make_request(KVM_REQ_PMU, vcpu);
-}
-
static void kvm_check_pmu(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_PMU, vcpu)) {
@@ -299,7 +298,10 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST;
if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) {
- kvm_lose_pmu(vcpu);
+ if (vcpu->arch.aux_inuse & KVM_LARCH_PMU) {
+ kvm_lose_pmu(vcpu);
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
/* make sure the vcpu mode has been written */
smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE);
local_irq_enable();
@@ -1604,9 +1606,6 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_restore_timer(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
- /* Restore hardware PMU CSRs */
- kvm_restore_pmu(vcpu);
-
/* Don't bother restoring registers multiple times unless necessary */
if (vcpu->arch.aux_inuse & KVM_LARCH_HWCSR_USABLE)
return 0;
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index c3e4586a7975..6bfd4b8dad1b 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -60,7 +60,6 @@ int __ref page_is_ram(unsigned long pfn)
return memblock_is_memory(addr) && !memblock_is_reserved(addr);
}
-#ifndef CONFIG_NUMA
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -72,7 +71,6 @@ void __init paging_init(void)
free_area_init(max_zone_pfns);
}
-#endif /* !CONFIG_NUMA */
void __ref free_initmem(void)
{
diff --git a/arch/loongarch/mm/ioremap.c b/arch/loongarch/mm/ioremap.c
index df949a3d0f34..27c336959fe8 100644
--- a/arch/loongarch/mm/ioremap.c
+++ b/arch/loongarch/mm/ioremap.c
@@ -6,7 +6,7 @@
#include <asm/io.h>
#include <asm-generic/early_ioremap.h>
-void __init __iomem *early_ioremap(u64 phys_addr, unsigned long size)
+void __init __iomem *early_ioremap(phys_addr_t phys_addr, unsigned long size)
{
return ((void __iomem *)TO_CACHE(phys_addr));
}
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index cbe53d0b7fb0..f97dc9936401 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -1624,6 +1624,9 @@ static int __arch_prepare_bpf_trampoline(struct jit_ctx *ctx, struct bpf_tramp_i
/* Direct jump skips 5 NOP instructions */
else if (is_bpf_text_address((unsigned long)orig_call))
orig_call += LOONGARCH_BPF_FENTRY_NBYTES;
+ /* Module tracing not supported - cause kernel lockups */
+ else if (is_module_text_address((unsigned long)orig_call))
+ return -ENOTSUPP;
if (flags & BPF_TRAMP_F_CALL_ORIG) {
move_addr(ctx, LOONGARCH_GPR_A0, (const u64)im);
diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c
index 5bc9627a6cf9..d9fc5d520b37 100644
--- a/arch/loongarch/pci/pci.c
+++ b/arch/loongarch/pci/pci.c
@@ -50,11 +50,11 @@ static int __init pcibios_init(void)
*/
lsize = cpu_last_level_cache_line_size();
- BUG_ON(!lsize);
+ if (lsize) {
+ pci_dfl_cache_line_size = lsize >> 2;
- pci_dfl_cache_line_size = lsize >> 2;
-
- pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize);
+ pr_debug("PCI: pci_cache_line_size set to %d bytes\n", lsize);
+ }
return 0;
}
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d8316f993482..c0cc3ca5da9f 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -19,7 +19,7 @@ ccflags-vdso := \
cflags-vdso := $(ccflags-vdso) \
-isystem $(shell $(CC) -print-file-name=include) \
$(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
- -std=gnu11 -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \
+ -std=gnu11 -fms-extensions -O2 -g -fno-strict-aliasing -fno-common -fno-builtin \
-fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \
$(call cc-option, -fno-asynchronous-unwind-tables) \
$(call cc-option, -fno-stack-protector)
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f41d38dfbf13..871a5d67bf41 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 580af574fe73..022fc85d94b3 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi
index 66197e73d4f0..2abeef5b744a 100644
--- a/arch/mips/boot/dts/econet/en751221.dtsi
+++ b/arch/mips/boot/dts/econet/en751221.dtsi
@@ -18,7 +18,7 @@
cpu@0 {
device_type = "cpu";
- compatible = "mips,mips24KEc";
+ compatible = "mips,mips34Kc";
reg = <0>;
};
};
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 29191fa1801e..a3101f2268c6 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -692,7 +692,7 @@ unsigned long mips_stack_top(void)
/* Space for the VDSO, data page & GIC user page */
if (current->thread.abi) {
top -= PAGE_ALIGN(current->thread.abi->vdso->size);
- top -= PAGE_SIZE;
+ top -= VDSO_NR_PAGES * PAGE_SIZE;
top -= mips_gic_present() ? PAGE_SIZE : 0;
/* Space to randomize the VDSO base */
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index d824ffe9a014..8cedc83c3266 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -408,3 +408,4 @@
467 n32 open_tree_attr sys_open_tree_attr
468 n32 file_getattr sys_file_getattr
469 n32 file_setattr sys_file_setattr
+470 n32 listns sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 7a7049c2c307..9b92bddf06b5 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -384,3 +384,4 @@
467 n64 open_tree_attr sys_open_tree_attr
468 n64 file_getattr sys_file_getattr
469 n64 file_setattr sys_file_setattr
+470 n64 listns sys_listns
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index d330274f0601..f810b8a55716 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -457,3 +457,4 @@
467 o32 open_tree_attr sys_open_tree_attr
468 o32 file_getattr sys_file_getattr
469 o32 file_setattr sys_file_setattr
+470 o32 listns sys_listns
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 347126dc010d..44a662536148 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -12,9 +12,11 @@
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/export.h>
+#include <linux/sort.h>
#include <asm/cpu.h>
#include <asm/cpu-type.h>
@@ -508,58 +510,95 @@ static int __init set_ntlb(char *str)
__setup("ntlb=", set_ntlb);
-/* Initialise all TLB entries with unique values */
-static void r4k_tlb_uniquify(void)
+
+/* Comparison function for EntryHi VPN fields. */
+static int r4k_vpn_cmp(const void *a, const void *b)
{
- int entry = num_wired_entries();
+ long v = *(unsigned long *)a - *(unsigned long *)b;
+ int s = sizeof(long) > sizeof(int) ? sizeof(long) * 8 - 1: 0;
+ return s ? (v != 0) | v >> s : v;
+}
+
+/*
+ * Initialise all TLB entries with unique values that do not clash with
+ * what we have been handed over and what we'll be using ourselves.
+ */
+static void __ref r4k_tlb_uniquify(void)
+{
+ int tlbsize = current_cpu_data.tlbsize;
+ bool use_slab = slab_is_available();
+ int start = num_wired_entries();
+ phys_addr_t tlb_vpn_size;
+ unsigned long *tlb_vpns;
+ unsigned long vpn_mask;
+ int cnt, ent, idx, i;
+
+ vpn_mask = GENMASK(cpu_vmbits - 1, 13);
+ vpn_mask |= IS_ENABLED(CONFIG_64BIT) ? 3ULL << 62 : 1 << 31;
+
+ tlb_vpn_size = tlbsize * sizeof(*tlb_vpns);
+ tlb_vpns = (use_slab ?
+ kmalloc(tlb_vpn_size, GFP_KERNEL) :
+ memblock_alloc_raw(tlb_vpn_size, sizeof(*tlb_vpns)));
+ if (WARN_ON(!tlb_vpns))
+ return; /* Pray local_flush_tlb_all() is good enough. */
htw_stop();
+
+ for (i = start, cnt = 0; i < tlbsize; i++, cnt++) {
+ unsigned long vpn;
+
+ write_c0_index(i);
+ mtc0_tlbr_hazard();
+ tlb_read();
+ tlb_read_hazard();
+ vpn = read_c0_entryhi();
+ vpn &= vpn_mask & PAGE_MASK;
+ tlb_vpns[cnt] = vpn;
+
+ /* Prevent any large pages from overlapping regular ones. */
+ write_c0_pagemask(read_c0_pagemask() & PM_DEFAULT_MASK);
+ mtc0_tlbw_hazard();
+ tlb_write_indexed();
+ tlbw_use_hazard();
+ }
+
+ sort(tlb_vpns, cnt, sizeof(tlb_vpns[0]), r4k_vpn_cmp, NULL);
+
+ write_c0_pagemask(PM_DEFAULT_MASK);
write_c0_entrylo0(0);
write_c0_entrylo1(0);
- while (entry < current_cpu_data.tlbsize) {
- unsigned long asid_mask = cpu_asid_mask(&current_cpu_data);
- unsigned long asid = 0;
- int idx;
+ idx = 0;
+ ent = tlbsize;
+ for (i = start; i < tlbsize; i++)
+ while (1) {
+ unsigned long entryhi, vpn;
- /* Skip wired MMID to make ginvt_mmid work */
- if (cpu_has_mmid)
- asid = MMID_KERNEL_WIRED + 1;
+ entryhi = UNIQUE_ENTRYHI(ent);
+ vpn = entryhi & vpn_mask & PAGE_MASK;
- /* Check for match before using UNIQUE_ENTRYHI */
- do {
- if (cpu_has_mmid) {
- write_c0_memorymapid(asid);
- write_c0_entryhi(UNIQUE_ENTRYHI(entry));
+ if (idx >= cnt || vpn < tlb_vpns[idx]) {
+ write_c0_entryhi(entryhi);
+ write_c0_index(i);
+ mtc0_tlbw_hazard();
+ tlb_write_indexed();
+ ent++;
+ break;
+ } else if (vpn == tlb_vpns[idx]) {
+ ent++;
} else {
- write_c0_entryhi(UNIQUE_ENTRYHI(entry) | asid);
+ idx++;
}
- mtc0_tlbw_hazard();
- tlb_probe();
- tlb_probe_hazard();
- idx = read_c0_index();
- /* No match or match is on current entry */
- if (idx < 0 || idx == entry)
- break;
- /*
- * If we hit a match, we need to try again with
- * a different ASID.
- */
- asid++;
- } while (asid < asid_mask);
-
- if (idx >= 0 && idx != entry)
- panic("Unable to uniquify TLB entry %d", idx);
-
- write_c0_index(entry);
- mtc0_tlbw_hazard();
- tlb_write_indexed();
- entry++;
- }
+ }
tlbw_use_hazard();
htw_start();
flush_micro_tlb();
+ if (use_slab)
+ kfree(tlb_vpns);
+ else
+ memblock_free(tlb_vpns, tlb_vpn_size);
}
/*
@@ -602,6 +641,7 @@ static void r4k_tlb_configure(void)
/* From this point on the ARC firmware is dead. */
r4k_tlb_uniquify();
+ local_flush_tlb_all();
/* Did I tell you that ARC SUCKS? */
}
diff --git a/arch/mips/mti-malta/malta-init.c b/arch/mips/mti-malta/malta-init.c
index 000d6d50520a..82b0fd8576a2 100644
--- a/arch/mips/mti-malta/malta-init.c
+++ b/arch/mips/mti-malta/malta-init.c
@@ -241,16 +241,22 @@ mips_pci_controller:
#endif
/*
- * Setup the Malta max (2GB) memory for PCI DMA in host bridge
- * in transparent addressing mode.
+ * Set up memory mapping in host bridge for PCI DMA masters,
+ * in transparent addressing mode. For EVA use the Malta
+ * maximum of 2 GiB memory in the alias space at 0x80000000
+ * as per PHYS_OFFSET. Otherwise use 256 MiB of memory in
+ * the regular space, avoiding mapping the PCI MMIO window
+ * for DMA as it seems to confuse the system controller's
+ * logic, causing PCI MMIO to stop working.
*/
- mask = PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH;
- MSC_WRITE(MSC01_PCI_BAR0, mask);
- MSC_WRITE(MSC01_PCI_HEAD4, mask);
+ mask = PHYS_OFFSET ? PHYS_OFFSET : 0xf0000000;
+ MSC_WRITE(MSC01_PCI_BAR0,
+ mask | PCI_BASE_ADDRESS_MEM_PREFETCH);
+ MSC_WRITE(MSC01_PCI_HEAD4,
+ PHYS_OFFSET | PCI_BASE_ADDRESS_MEM_PREFETCH);
- mask &= MSC01_PCI_BAR0_SIZE_MSK;
MSC_WRITE(MSC01_PCI_P2SCMSKL, mask);
- MSC_WRITE(MSC01_PCI_P2SCMAPL, mask);
+ MSC_WRITE(MSC01_PCI_P2SCMAPL, PHYS_OFFSET);
/* Don't handle target retries indefinitely. */
if ((data & MSC01_PCI_CFG_MAXRTRY_MSK) ==
diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile
index 17c42d718eb3..f8481e4e9d21 100644
--- a/arch/parisc/boot/compressed/Makefile
+++ b/arch/parisc/boot/compressed/Makefile
@@ -18,7 +18,7 @@ KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os
ifndef CONFIG_64BIT
KBUILD_CFLAGS += -mfast-indirect-calls
endif
-KBUILD_CFLAGS += -std=gnu11
+KBUILD_CFLAGS += -std=gnu11 -fms-extensions
LDFLAGS_vmlinux := -X -e startup --as-needed -T
$(obj)/vmlinux: $(obj)/vmlinux.lds $(addprefix $(obj)/, $(OBJECTS)) $(LIBGCC) FORCE
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 88a788a7b18d..39bdacaa530b 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -468,3 +468,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e24f4d88885a..9537a61ebae0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_DMA_OPS if PPC64
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_GIGANTIC_PAGE if ARCH_SUPPORTS_HUGETLBFS
select ARCH_HAS_KCOV
select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index c47b78c1d3e7..f1a4761ebd44 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -70,7 +70,7 @@ BOOTCPPFLAGS := -nostdinc $(LINUXINCLUDE)
BOOTCPPFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include)
BOOTCFLAGS := $(BOOTTARGETFLAGS) \
- -std=gnu11 \
+ -std=gnu11 -fms-extensions \
-Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-fno-strict-aliasing -O2 \
-msoft-float -mno-altivec -mno-vsx \
@@ -86,6 +86,7 @@ BOOTARFLAGS := -crD
ifdef CONFIG_CC_IS_CLANG
BOOTCFLAGS += $(CLANG_FLAGS)
+BOOTCFLAGS += -Wno-microsoft-anon-tag
BOOTAFLAGS += $(CLANG_FLAGS)
endif
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index b453e80dfc00..ec4458cdb97b 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -560,3 +560,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7b527d18aa5e..4c321a8ea896 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -423,7 +423,6 @@ config PPC_64S_HASH_MMU
config PPC_RADIX_MMU
bool "Radix MMU Support"
depends on PPC_BOOK3S_64
- select ARCH_HAS_GIGANTIC_PAGE
default y
help
Enable support for the Power ISA 3.0 Radix style MMU. Currently this
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 7ec60290abe6..78c4b6ce5f13 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -267,22 +267,11 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags,
static int spufs_context_open(const struct path *path)
{
- int ret;
- struct file *filp;
-
- ret = get_unused_fd_flags(0);
- if (ret < 0)
- return ret;
-
- filp = dentry_open(path, O_RDONLY, current_cred());
- if (IS_ERR(filp)) {
- put_unused_fd(ret);
- return PTR_ERR(filp);
- }
-
- filp->f_op = &spufs_context_fops;
- fd_install(ret, filp);
- return ret;
+ FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred()));
+ if (fdf.err)
+ return fdf.err;
+ fd_prepare_file(fdf)->f_op = &spufs_context_fops;
+ return fd_publish(fdf);
}
static struct spu_context *
@@ -508,26 +497,15 @@ static const struct file_operations spufs_gang_fops = {
static int spufs_gang_open(const struct path *path)
{
- int ret;
- struct file *filp;
-
- ret = get_unused_fd_flags(0);
- if (ret < 0)
- return ret;
-
/*
* get references for dget and mntget, will be released
* in error path of *_open().
*/
- filp = dentry_open(path, O_RDONLY, current_cred());
- if (IS_ERR(filp)) {
- put_unused_fd(ret);
- return PTR_ERR(filp);
- }
-
- filp->f_op = &spufs_gang_fops;
- fd_install(ret, filp);
- return ret;
+ FD_PREPARE(fdf, 0, dentry_open(path, O_RDONLY, current_cred()));
+ if (fdf.err)
+ return fdf.err;
+ fd_prepare_file(fdf)->f_op = &spufs_gang_fops;
+ return fd_publish(fdf);
}
static int spufs_create_gang(struct inode *inode,
diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c
index 21a2f447c43f..dd7b668799d9 100644
--- a/arch/powerpc/platforms/pseries/papr-hvpipe.c
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c
@@ -479,10 +479,7 @@ static const struct file_operations papr_hvpipe_handle_ops = {
static int papr_hvpipe_dev_create_handle(u32 srcID)
{
- struct hvpipe_source_info *src_info;
- struct file *file;
- long err;
- int fd;
+ struct hvpipe_source_info *src_info __free(kfree) = NULL;
spin_lock(&hvpipe_src_list_lock);
/*
@@ -506,20 +503,13 @@ static int papr_hvpipe_dev_create_handle(u32 srcID)
src_info->tsk = current;
init_waitqueue_head(&src_info->recv_wqh);
- fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
- if (fd < 0) {
- err = fd;
- goto free_buf;
- }
-
- file = anon_inode_getfile("[papr-hvpipe]",
- &papr_hvpipe_handle_ops, (void *)src_info,
- O_RDWR);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto free_fd;
- }
+ FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC,
+ anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops,
+ (void *)src_info, O_RDWR));
+ if (fdf.err)
+ return fdf.err;
+ retain_and_null_ptr(src_info);
spin_lock(&hvpipe_src_list_lock);
/*
* If two processes are executing ioctl() for the same
@@ -528,22 +518,11 @@ static int papr_hvpipe_dev_create_handle(u32 srcID)
*/
if (hvpipe_find_source(srcID)) {
spin_unlock(&hvpipe_src_list_lock);
- err = -EALREADY;
- goto free_file;
+ return -EALREADY;
}
list_add(&src_info->list, &hvpipe_src_list);
spin_unlock(&hvpipe_src_list_lock);
-
- fd_install(fd, file);
- return fd;
-
-free_file:
- fput(file);
-free_fd:
- put_unused_fd(fd);
-free_buf:
- kfree(src_info);
- return err;
+ return fd_publish(fdf);
}
/*
diff --git a/arch/powerpc/platforms/pseries/papr-platform-dump.c b/arch/powerpc/platforms/pseries/papr-platform-dump.c
index f8d55eccdb6b..be633c9a0e75 100644
--- a/arch/powerpc/platforms/pseries/papr-platform-dump.c
+++ b/arch/powerpc/platforms/pseries/papr-platform-dump.c
@@ -303,8 +303,6 @@ static long papr_platform_dump_create_handle(u64 dump_tag)
{
struct ibm_platform_dump_params *params;
u64 param_dump_tag;
- struct file *file;
- long err;
int fd;
/*
@@ -334,34 +332,22 @@ static long papr_platform_dump_create_handle(u64 dump_tag)
params->dump_tag_lo = (u32)(dump_tag & 0x00000000ffffffffULL);
params->status = RTAS_IBM_PLATFORM_DUMP_START;
- fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
+ fd = FD_ADD(O_RDONLY | O_CLOEXEC,
+ anon_inode_getfile_fmode("[papr-platform-dump]",
+ &papr_platform_dump_handle_ops,
+ (void *)params, O_RDONLY,
+ FMODE_LSEEK | FMODE_PREAD));
if (fd < 0) {
- err = fd;
- goto free_area;
- }
-
- file = anon_inode_getfile_fmode("[papr-platform-dump]",
- &papr_platform_dump_handle_ops,
- (void *)params, O_RDONLY,
- FMODE_LSEEK | FMODE_PREAD);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto put_fd;
+ rtas_work_area_free(params->work_area);
+ kfree(params);
+ return fd;
}
- fd_install(fd, file);
-
list_add(&params->list, &platform_dump_list);
pr_info("%s (%d) initiated platform dump for dump tag %llu\n",
current->comm, current->pid, dump_tag);
return fd;
-put_fd:
- put_unused_fd(fd);
-free_area:
- rtas_work_area_free(params->work_area);
- kfree(params);
- return err;
}
/*
diff --git a/arch/powerpc/platforms/pseries/papr-rtas-common.c b/arch/powerpc/platforms/pseries/papr-rtas-common.c
index 33c606e3378a..1630e0cd210c 100644
--- a/arch/powerpc/platforms/pseries/papr-rtas-common.c
+++ b/arch/powerpc/platforms/pseries/papr-rtas-common.c
@@ -205,35 +205,18 @@ long papr_rtas_setup_file_interface(struct papr_rtas_sequence *seq,
char *name)
{
const struct papr_rtas_blob *blob;
- struct file *file;
- long ret;
int fd;
blob = papr_rtas_retrieve(seq);
if (IS_ERR(blob))
return PTR_ERR(blob);
- fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
- if (fd < 0) {
- ret = fd;
- goto free_blob;
- }
-
- file = anon_inode_getfile_fmode(name, fops, (void *)blob,
- O_RDONLY, FMODE_LSEEK | FMODE_PREAD);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto put_fd;
- }
-
- fd_install(fd, file);
+ fd = FD_ADD(O_RDONLY | O_CLOEXEC,
+ anon_inode_getfile_fmode(name, fops, (void *)blob, O_RDONLY,
+ FMODE_LSEEK | FMODE_PREAD));
+ if (fd < 0)
+ papr_rtas_blob_free(blob);
return fd;
-
-put_fd:
- put_unused_fd(fd);
-free_blob:
- papr_rtas_blob_free(blob);
- return ret;
}
/*
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 22cda9c452d2..fadec20b87a8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -367,7 +367,7 @@ config RISCV_NONSTANDARD_CACHE_OPS
systems to handle cache management.
config AS_HAS_INSN
- def_bool $(as-instr,.insn r 51$(comma) 0$(comma) 0$(comma) t0$(comma) t0$(comma) zero)
+ def_bool $(as-instr,.insn 0x100000f)
config AS_HAS_OPTION_ARCH
# https://github.com/llvm/llvm-project/commit/9e8ed3403c191ab9c4903e8eeb8f732ff8a43cb4
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index ecf2fcce2d92..4c6de57f65ef 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -134,21 +134,6 @@ endif
CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS)
# Default target when executing plain make
-boot := arch/riscv/boot
-ifeq ($(CONFIG_XIP_KERNEL),y)
-KBUILD_IMAGE := $(boot)/xipImage
-else
-ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN_K210),yy)
-KBUILD_IMAGE := $(boot)/loader.bin
-else
-ifeq ($(CONFIG_EFI_ZBOOT),)
-KBUILD_IMAGE := $(boot)/Image.gz
-else
-KBUILD_IMAGE := $(boot)/vmlinuz.efi
-endif
-endif
-endif
-
boot := arch/riscv/boot
boot-image-y := Image
boot-image-$(CONFIG_KERNEL_BZIP2) := Image.bz2
@@ -159,7 +144,7 @@ boot-image-$(CONFIG_KERNEL_LZO) := Image.lzo
boot-image-$(CONFIG_KERNEL_ZSTD) := Image.zst
boot-image-$(CONFIG_KERNEL_XZ) := Image.xz
ifdef CONFIG_RISCV_M_MODE
-boot-image-$(CONFIG_ARCH_CANAAN) := loader.bin
+boot-image-$(CONFIG_SOC_CANAAN_K210) := loader.bin
endif
boot-image-$(CONFIG_EFI_ZBOOT) := vmlinuz.efi
boot-image-$(CONFIG_XIP_KERNEL) := xipImage
diff --git a/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi b/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi
index 6367112e614a..a7442a508433 100644
--- a/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi
+++ b/arch/riscv/boot/dts/allwinner/sun20i-d1s.dtsi
@@ -28,7 +28,7 @@
riscv,isa-base = "rv64i";
riscv,isa-extensions = "i", "m", "a", "f", "d", "c", "zicntr", "zicsr",
"zifencei", "zihpm", "xtheadvector";
- thead,vlenb = <128>;
+ thead,vlenb = <16>;
#cooling-cells = <2>;
cpu0_intc: interrupt-controller {
diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h
index 3b09874d7a6d..7f5030ee1fcf 100644
--- a/arch/riscv/include/asm/vendorid_list.h
+++ b/arch/riscv/include/asm/vendorid_list.h
@@ -7,8 +7,8 @@
#define ANDES_VENDOR_ID 0x31e
#define MICROCHIP_VENDOR_ID 0x029
+#define MIPS_VENDOR_ID 0x127
#define SIFIVE_VENDOR_ID 0x489
#define THEAD_VENDOR_ID 0x5b7
-#define MIPS_VENDOR_ID 0x722
#endif
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 5e8cde055264..c443337056ab 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -648,9 +648,9 @@ int sbi_debug_console_read(char *bytes, unsigned int num_bytes)
void __init sbi_init(void)
{
+ bool srst_power_off = false;
int ret;
- sbi_set_power_off();
ret = sbi_get_spec_version();
if (ret > 0)
sbi_spec_version = ret;
@@ -683,6 +683,7 @@ void __init sbi_init(void)
sbi_probe_extension(SBI_EXT_SRST)) {
pr_info("SBI SRST extension detected\n");
register_platform_power_off(sbi_srst_power_off);
+ srst_power_off = true;
sbi_srst_reboot_nb.notifier_call = sbi_srst_reboot;
sbi_srst_reboot_nb.priority = 192;
register_restart_handler(&sbi_srst_reboot_nb);
@@ -702,4 +703,7 @@ void __init sbi_init(void)
__sbi_send_ipi = __sbi_send_ipi_v01;
__sbi_rfence = __sbi_rfence_v01;
}
+
+ if (!srst_power_off)
+ sbi_set_power_off();
}
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index fda0346f0ea1..11422cb95a64 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -689,8 +689,20 @@ bool kvm_riscv_vcpu_aia_imsic_has_interrupt(struct kvm_vcpu *vcpu)
*/
read_lock_irqsave(&imsic->vsfile_lock, flags);
- if (imsic->vsfile_cpu > -1)
- ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei));
+ if (imsic->vsfile_cpu > -1) {
+ /*
+ * This function is typically called from kvm_vcpu_block() via
+ * kvm_arch_vcpu_runnable() upon WFI trap. The kvm_vcpu_block()
+ * can be preempted and the blocking VCPU might resume on a
+ * different CPU. This means it is possible that current CPU
+ * does not match the imsic->vsfile_cpu hence this function
+ * must check imsic->vsfile_cpu before accessing HGEIP CSR.
+ */
+ if (imsic->vsfile_cpu != vcpu->cpu)
+ ret = true;
+ else
+ ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei));
+ }
read_unlock_irqrestore(&imsic->vsfile_lock, flags);
return ret;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 525fb5a330c0..58f5f3536ffd 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -171,7 +171,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
hva_t hva, reg_end, size;
- gpa_t base_gpa;
bool writable;
int ret = 0;
@@ -190,15 +189,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
hva = new->userspace_addr;
size = new->npages << PAGE_SHIFT;
reg_end = hva + size;
- base_gpa = new->base_gfn << PAGE_SHIFT;
writable = !(new->flags & KVM_MEM_READONLY);
mmap_read_lock(current->mm);
/*
* A memory region could potentially cover multiple VMAs, and
- * any holes between them, so iterate over all of them to find
- * out if we can map any of them right now.
+ * any holes between them, so iterate over all of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
@@ -209,7 +206,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
*/
do {
struct vm_area_struct *vma;
- hva_t vm_start, vm_end;
+ hva_t vm_end;
vma = find_vma_intersection(current->mm, hva, reg_end);
if (!vma)
@@ -225,36 +222,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
}
/* Take the intersection of this VMA with the memory region */
- vm_start = max(hva, vma->vm_start);
vm_end = min(reg_end, vma->vm_end);
if (vma->vm_flags & VM_PFNMAP) {
- gpa_t gpa = base_gpa + (vm_start - hva);
- phys_addr_t pa;
-
- pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
- pa += vm_start - vma->vm_start;
-
/* IO region dirty page logging not allowed */
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ret = -EINVAL;
goto out;
}
-
- ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start,
- writable, false);
- if (ret)
- break;
}
hva = vm_end;
} while (hva < reg_end);
- if (change == KVM_MR_FLAGS_ONLY)
- goto out;
-
- if (ret)
- kvm_riscv_mmu_iounmap(kvm, base_gpa, size);
-
out:
mmap_read_unlock(current->mm);
return ret;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index bccb919ca615..5ce35aba6069 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -212,7 +212,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) &&
+ return (kvm_riscv_vcpu_has_interrupts(vcpu, -1ULL) &&
!kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause);
}
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index b4769241332b..8578361133a4 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -22,7 +22,7 @@ KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
ifndef CONFIG_AS_IS_LLVM
KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf))
endif
-KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11
+KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 -mpacked-stack -std=gnu11 -fms-extensions
KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY
KBUILD_CFLAGS_DECOMPRESSOR += -D__DECOMPRESSOR
KBUILD_CFLAGS_DECOMPRESSOR += -Wno-pointer-sign
@@ -35,6 +35,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-membe
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
UTS_MACHINE := s390x
STACK_SIZE := $(if $(CONFIG_KASAN),65536,$(if $(CONFIG_KMSAN),65536,16384))
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index b7100c6a4054..6663f1619abb 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1154,17 +1154,15 @@ static inline pte_t pte_mkhuge(pte_t pte)
#define IPTE_NODAT 0x400
#define IPTE_GUEST_ASCE 0x800
-static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep,
- unsigned long opt, unsigned long asce,
- int local)
+static __always_inline void __ptep_rdp(unsigned long addr, pte_t *ptep, int local)
{
unsigned long pto;
pto = __pa(ptep) & ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
- asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%[asce],%[m4]"
+ asm volatile(".insn rrf,0xb98b0000,%[r1],%[r2],%%r0,%[m4]"
: "+m" (*ptep)
- : [r1] "a" (pto), [r2] "a" ((addr & PAGE_MASK) | opt),
- [asce] "a" (asce), [m4] "i" (local));
+ : [r1] "a" (pto), [r2] "a" (addr & PAGE_MASK),
+ [m4] "i" (local));
}
static __always_inline void __ptep_ipte(unsigned long address, pte_t *ptep,
@@ -1348,7 +1346,7 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
* A local RDP can be used to do the flush.
*/
if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT))
- __ptep_rdp(address, ptep, 0, 0, 1);
+ __ptep_rdp(address, ptep, 1);
}
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 8a6744d658db..5863787ab036 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -472,3 +472,4 @@
467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr sys_file_setattr
+470 common listns sys_listns sys_listns
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 0fde20bbc50b..05974304d622 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -274,9 +274,9 @@ void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
preempt_disable();
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
- __ptep_rdp(addr, ptep, 0, 0, 1);
+ __ptep_rdp(addr, ptep, 1);
else
- __ptep_rdp(addr, ptep, 0, 0, 0);
+ __ptep_rdp(addr, ptep, 0);
/*
* PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
* means it is still valid and active, and must not be changed according
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index bd39b36e7bd6..0c196a5b194a 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -13,7 +13,7 @@ CFLAGS_sha256.o := -D__NO_FORTIFY
$(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE
$(call if_changed_rule,as_o_S)
-KBUILD_CFLAGS := -std=gnu11 -fno-strict-aliasing -Wall -Wstrict-prototypes
+KBUILD_CFLAGS := -std=gnu11 -fms-extensions -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common
@@ -21,6 +21,7 @@ KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(CLANG_FLAGS)
+KBUILD_CFLAGS += $(if $(CONFIG_CC_IS_CLANG),-Wno-microsoft-anon-tag)
KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS))
KBUILD_AFLAGS += -D__DISABLE_EXPORTS
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 5e9c9eff5539..969c11325ade 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -473,3 +473,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index ebb7d06d1044..39aa26b6a50b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -515,3 +515,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1a27efcf3c20..1d403a3612ea 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -48,7 +48,8 @@ endif
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
-REALMODE_CFLAGS := -std=gnu11 -m16 -g -Os -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
+REALMODE_CFLAGS := -std=gnu11 -fms-extensions -m16 -g -Os \
+ -DDISABLE_BRANCH_PROFILING -D__DISABLE_EXPORTS \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
-mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
@@ -60,6 +61,7 @@ REALMODE_CFLAGS += $(cc_stack_align4)
REALMODE_CFLAGS += $(CLANG_FLAGS)
ifdef CONFIG_CC_IS_CLANG
REALMODE_CFLAGS += -Wno-gnu
+REALMODE_CFLAGS += -Wno-microsoft-anon-tag
endif
export REALMODE_CFLAGS
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 74657589264d..68f9d7a1683b 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -25,7 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
# avoid errors with '-march=i386', and future flags may depend on the target to
# be valid.
KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
-KBUILD_CFLAGS += -std=gnu11
+KBUILD_CFLAGS += -std=gnu11 -fms-extensions
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@@ -36,7 +36,10 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding -fshort-wchar
KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
-KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-microsoft-anon-tag
+endif
KBUILD_CFLAGS += -Wno-pointer-sign
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4877e16da69a..e979a3eac7a3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -475,3 +475,4 @@
467 i386 open_tree_attr sys_open_tree_attr
468 i386 file_getattr sys_file_getattr
469 i386 file_setattr sys_file_setattr
+470 i386 listns sys_listns
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ced2a1deecd7..8a4ac4841be6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -394,6 +394,7 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 745caa6c15a3..fa6c47b50989 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2789,13 +2789,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
return;
}
- if (perf_callchain_store(entry, regs->ip))
- return;
-
- if (perf_hw_regs(regs))
+ if (perf_hw_regs(regs)) {
+ if (perf_callchain_store(entry, regs->ip))
+ return;
unwind_start(&state, current, regs, NULL);
- else
+ } else {
unwind_start(&state, current, NULL, (void *)regs->sp);
+ }
for (; !unwind_done(&state); unwind_next_frame(&state)) {
addr = unwind_get_return_address(&state);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d6c945cc5d07..e228e564b15e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1325,8 +1325,6 @@ static void uncore_pci_sub_driver_init(void)
continue;
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
- if (!pmu)
- continue;
if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 93156ac4ffe0..b08c95872eed 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -56,6 +56,11 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
return &arch_ftrace_regs(fregs)->regs;
}
+#define arch_ftrace_partial_regs(regs) do { \
+ regs->flags &= ~X86_EFLAGS_FIXED; \
+ regs->cs = __KERNEL_CS; \
+} while (0)
+
#define arch_ftrace_fill_perf_regs(fregs, _regs) do { \
(_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \
(_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 9792e329343e..1baa86dfe029 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -93,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
#define EXIT_REASON_MSR_READ_IMM 84
#define EXIT_REASON_MSR_WRITE_IMM 85
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 7047124490f6..d7c8ef1e354d 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -196,7 +196,7 @@ int amd_detect_prefcore(bool *detected)
break;
}
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
u32 tmp;
int ret;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 2ba9f2d42d8c..5d46709c58d0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1037,7 +1037,14 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
static const struct x86_cpu_id zen5_rdseed_microcode[] = {
ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
{},
};
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index dc82153009da..a881bf4c2011 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -224,6 +224,7 @@ static bool need_sha_check(u32 cur_rev)
case 0xb1010: return cur_rev <= 0xb101046; break;
case 0xb2040: return cur_rev <= 0xb204031; break;
case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb4041: return cur_rev <= 0xb404101; break;
case 0xb6000: return cur_rev <= 0xb600031; break;
case 0xb6080: return cur_rev <= 0xb608031; break;
case 0xb7000: return cur_rev <= 0xb700031; break;
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 367da3638167..823dbdd0eb41 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -354,12 +354,17 @@ SYM_CODE_START(return_to_handler)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
+ /* Restore return_to_handler value that got eaten by previous ret instruction. */
+ subq $8, %rsp
+ UNWIND_HINT_FUNC
+
/* Save ftrace_regs for function exit context */
subq $(FRAME_SIZE), %rsp
movq %rax, RAX(%rsp)
movq %rdx, RDX(%rsp)
movq %rbp, RBP(%rsp)
+ movq %rsp, RSP(%rsp)
movq %rsp, %rdi
call ftrace_return_to_handler
@@ -368,7 +373,8 @@ SYM_CODE_START(return_to_handler)
movq RDX(%rsp), %rdx
movq RAX(%rsp), %rax
- addq $(FRAME_SIZE), %rsp
+ addq $(FRAME_SIZE) + 8, %rsp
+
/*
* Jump back to the old return address. This cannot be JMP_NOSPEC rdi
* since IBT would demand that contain ENDBR, which simply isn't so for
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index f286b5706d7c..fef00546c885 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
* This function is called from IOMMU driver to notify
* SVM to schedule in a particular vCPU of a particular VM.
*/
-int avic_ga_log_notifier(u32 ga_tag)
+static int avic_ga_log_notifier(u32 ga_tag)
{
unsigned long flags;
struct kvm_svm *kvm_svm;
@@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
+ raw_spin_lock_init(&svm->ir_list_lock);
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
if (!vcpu)
return;
- spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
list_del(&irqfd->vcpu_list);
- spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
@@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
* list of IRQs being posted to the vCPU, to ensure the IRTE
* isn't programmed with stale pCPU/IsRunning information.
*/
- guard(spinlock_irqsave)(&svm->ir_list_lock);
+ guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
/*
* Update the target pCPU for IOMMU doorbells if the vCPU is
@@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
* up-to-date entry information, or that this task will wait until
* svm_ir_list_add() completes to set the new target pCPU.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
@@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
* or that this task will wait until svm_ir_list_add() completes to
* mark the vCPU as not running.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
avic_update_iommu_vcpu_affinity(vcpu, -1, action);
@@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
svm->avic_physical_id_entry = entry;
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void)
return true;
}
+
+void avic_hardware_unsetup(void)
+{
+ if (avic)
+ amd_iommu_register_ga_log_notifier(NULL);
+}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a6443feab252..da6e80b3ac35 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
*/
svm_copy_lbrs(vmcb02, vmcb12);
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
- svm_update_lbrv(&svm->vcpu);
-
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ } else {
svm_copy_lbrs(vmcb02, vmcb01);
}
+ svm_update_lbrv(&svm->vcpu);
}
static inline bool is_evtinj_soft(u32 evtinj)
@@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_next_rip = vmcb12_rip;
}
- vmcb02->control.virt_ext = vmcb01->control.virt_ext &
- LBR_CTL_ENABLE_MASK;
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
- vmcb02->control.virt_ext |=
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
@@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
svm_copy_lbrs(vmcb12, vmcb02);
- svm_update_lbrv(vcpu);
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ else
svm_copy_lbrs(vmcb01, vmcb02);
- svm_update_lbrv(vcpu);
- }
+
+ svm_update_lbrv(vcpu);
if (vnmi) {
if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 153c12dbf3eb..9d29b2e7e855 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -705,7 +705,11 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (intercept == svm->lbr_msrs_intercepted)
+ return;
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
@@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
if (sev_es_guest(vcpu->kvm))
svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
+
+ svm->lbr_msrs_intercepted = intercept;
}
void svm_vcpu_free_msrpm(void *msrpm)
@@ -806,60 +812,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
-
- /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}
-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ __svm_enable_lbrv(vcpu);
svm_recalc_lbr_msr_intercepts(vcpu);
-
- /*
- * Move the LBR msrs back to the vmcb01 to avoid copying them
- * on nested guest entries.
- */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
- /*
- * If LBR virtualization is disabled, the LBR MSRs are always kept in
- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
- */
- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
- svm->vmcb01.ptr;
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
- if (enable_lbrv == current_enable_lbrv)
- return;
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
- if (enable_lbrv)
- svm_enable_lbrv(vcpu);
- else
- svm_disable_lbrv(vcpu);
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -921,6 +910,8 @@ static void svm_hardware_unsetup(void)
{
int cpu;
+ avic_hardware_unsetup();
+
sev_hardware_unsetup();
for_each_possible_cpu(cpu)
@@ -1236,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
}
svm->x2avic_msrs_intercepted = true;
+ svm->lbr_msrs_intercepted = true;
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
@@ -2722,19 +2714,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -3002,7 +2994,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm_get_lbr_vmcb(svm)->save.dbgctl = data;
+ if (svm->vmcb->save.dbgctl == data)
+ break;
+
+ svm->vmcb->save.dbgctl = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
svm_update_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
@@ -5386,12 +5382,6 @@ static __init int svm_hardware_setup(void)
svm_hv_hardware_setup();
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
enable_apicv = avic_hardware_setup();
if (!enable_apicv) {
enable_ipiv = false;
@@ -5435,6 +5425,13 @@ static __init int svm_hardware_setup(void)
svm_set_cpu_caps();
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
return 0;
err:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e4b04f435b3d..dd78e6402345 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -329,13 +329,14 @@ struct vcpu_svm {
* back into remapped mode).
*/
struct list_head ir_list;
- spinlock_t ir_list_lock;
+ raw_spinlock_t ir_list_lock;
struct vcpu_sev_es_state sev_es;
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+ bool lbr_msrs_intercepted;
/* Guest GIF value, used when vGIF is not enabled */
bool guest_gif;
@@ -805,7 +806,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
)
bool __init avic_hardware_setup(void);
-int avic_ga_log_notifier(u32 ga_tag);
+void avic_hardware_unsetup(void);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index bc5ece76533a..412d0829d7a2 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
? PFERR_PRESENT_MASK : 0;
- if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb70..bcea087b642f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_NOTIFY:
/* Notify VM exit is not exposed to L1 */
return false;
+ case EXIT_REASON_SEAMCALL:
+ case EXIT_REASON_TDCALL:
+ /*
+ * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+ * virtualized by KVM for L1 hypervisors, i.e. L1 should
+ * never want or expect such an exit.
+ */
+ return false;
default:
return true;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f87c216d976d..91b6f2f3edc2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..c9c2aa6f4705 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
/*
* Returns true if the MSR in question is managed via XSTATE, i.e. is context
- * switched with the rest of guest FPU state. Note! S_CET is _not_ context
- * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS.
- * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
- * the value saved/restored via XSTATE is always the host's value. That detail
- * is _extremely_ important, as the guest's S_CET must _never_ be resident in
- * hardware while executing in the host. Loading guest values for U_CET and
- * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
- * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
- * privilege levels, i.e. are effectively only consumed by userspace as well.
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
*/
static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
{
@@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
* MSR that is managed via XSTATE. Note, the caller is responsible for doing
* the initial FPU load, this helper only ensures that guest state is resident
* in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
*/
static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
struct msr_data *msr_info,
@@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
@@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
@@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int r;
vcpu_load(vcpu);
- if (kvm_mpx_supported())
- kvm_load_guest_fpu(vcpu);
-
kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_apic_accept_events(vcpu);
@@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
out:
kvm_vcpu_srcu_read_unlock(vcpu);
-
- if (kvm_mpx_supported())
- kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
u64 xfeatures_mask;
+ bool fpu_in_use;
int i;
/*
@@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
/*
- * All paths that lead to INIT are required to load the guest's FPU
- * state (because most paths are buried in KVM_RUN).
- */
- kvm_put_guest_fpu(vcpu);
+ * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+ * as the kernel can only modify the state when it's resident in memory,
+ * i.e. when it's not loaded into hardware.
+ *
+ * WARN if the vCPU's desire to run, i.e. whether or not it's in KVM_RUN,
+ * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+ * only path that can trigger INIT emulation _and_ loads FPU state, and
+ * KVM_RUN should _always_ load FPU state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+ fpu_in_use = fpstate->in_use;
+ if (fpu_in_use)
+ kvm_put_guest_fpu(vcpu);
for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
fpstate_clear_xstate_component(fpstate, i);
- kvm_load_guest_fpu(vcpu);
+ if (fpu_in_use)
+ kvm_load_guest_fpu(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 374e4cb788d8..438a3b170402 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/block/bdev.c b/block/bdev.c
index 810707cca970..b8fbb9576110 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -67,7 +67,7 @@ static void bdev_write_inode(struct block_device *bdev)
int ret;
spin_lock(&inode->i_lock);
- while (inode->i_state & I_DIRTY) {
+ while (inode_state_read(inode) & I_DIRTY) {
spin_unlock(&inode->i_lock);
ret = write_inode_now(inode, true);
if (ret)
@@ -217,9 +217,26 @@ int set_blocksize(struct file *file, int size)
EXPORT_SYMBOL(set_blocksize);
+static int sb_validate_large_blocksize(struct super_block *sb, int size)
+{
+ const char *err_str = NULL;
+
+ if (!(sb->s_type->fs_flags & FS_LBS))
+ err_str = "not supported by filesystem";
+ else if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ err_str = "is only supported with CONFIG_TRANSPARENT_HUGEPAGE";
+
+ if (!err_str)
+ return 0;
+
+ pr_warn_ratelimited("%s: block size(%d) > page size(%lu) %s\n",
+ sb->s_type->name, size, PAGE_SIZE, err_str);
+ return -EINVAL;
+}
+
int sb_set_blocksize(struct super_block *sb, int size)
{
- if (!(sb->s_type->fs_flags & FS_LBS) && size > PAGE_SIZE)
+ if (size > PAGE_SIZE && sb_validate_large_blocksize(sb, size))
return 0;
if (set_blocksize(sb->s_bdev_file, size))
return 0;
@@ -231,7 +248,7 @@ int sb_set_blocksize(struct super_block *sb, int size)
EXPORT_SYMBOL(sb_set_blocksize);
-int sb_min_blocksize(struct super_block *sb, int size)
+int __must_check sb_min_blocksize(struct super_block *sb, int size)
{
int minsize = bdev_logical_block_size(sb->s_bdev);
if (size < minsize)
@@ -1265,7 +1282,7 @@ void sync_bdevs(bool wait)
struct block_device *bdev;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW) ||
mapping->nrpages == 0) {
spin_unlock(&inode->i_lock);
continue;
diff --git a/block/fops.c b/block/fops.c
index 5e3db9fead77..4dad9c2d5796 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = {
#else /* CONFIG_BUFFER_HEAD */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &blkdev_iomap_ops);
+ iomap_bio_read_folio(folio, &blkdev_iomap_ops);
+ return 0;
}
static void blkdev_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &blkdev_iomap_ops);
+ iomap_bio_readahead(rac, &blkdev_iomap_ops);
}
static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
diff --git a/drivers/acpi/acpi_mrrm.c b/drivers/acpi/acpi_mrrm.c
index a6dbf623e557..6d69554c940e 100644
--- a/drivers/acpi/acpi_mrrm.c
+++ b/drivers/acpi/acpi_mrrm.c
@@ -152,26 +152,49 @@ ATTRIBUTE_GROUPS(memory_range);
static __init int add_boot_memory_ranges(void)
{
- struct kobject *pkobj, *kobj;
+ struct kobject *pkobj, *kobj, **kobjs;
int ret = -EINVAL;
- char *name;
+ char name[16];
+ int i;
pkobj = kobject_create_and_add("memory_ranges", acpi_kobj);
+ if (!pkobj)
+ return -ENOMEM;
- for (int i = 0; i < mrrm_mem_entry_num; i++) {
- name = kasprintf(GFP_KERNEL, "range%d", i);
- if (!name) {
- ret = -ENOMEM;
- break;
- }
+ kobjs = kcalloc(mrrm_mem_entry_num, sizeof(*kobjs), GFP_KERNEL);
+ if (!kobjs) {
+ kobject_put(pkobj);
+ return -ENOMEM;
+ }
+ for (i = 0; i < mrrm_mem_entry_num; i++) {
+ scnprintf(name, sizeof(name), "range%d", i);
kobj = kobject_create_and_add(name, pkobj);
+ if (!kobj) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
ret = sysfs_create_groups(kobj, memory_range_groups);
- if (ret)
- return ret;
+ if (ret) {
+ kobject_put(kobj);
+ goto cleanup;
+ }
+ kobjs[i] = kobj;
}
+ kfree(kobjs);
+ return 0;
+
+cleanup:
+ for (int j = 0; j < i; j++) {
+ if (kobjs[j]) {
+ sysfs_remove_groups(kobjs[j], memory_range_groups);
+ kobject_put(kobjs[j]);
+ }
+ }
+ kfree(kobjs);
+ kobject_put(pkobj);
return ret;
}
diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c
index 3c87953dbd19..305c240a303f 100644
--- a/drivers/acpi/apei/einj-core.c
+++ b/drivers/acpi/apei/einj-core.c
@@ -182,6 +182,7 @@ bool einj_initialized __ro_after_init;
static void __iomem *einj_param;
static u32 v5param_size;
+static u32 v66param_size;
static bool is_v2;
static void einj_exec_ctx_init(struct apei_exec_context *ctx)
@@ -283,6 +284,24 @@ static void check_vendor_extension(u64 paddr,
acpi_os_unmap_iomem(p, sizeof(v));
}
+static u32 einjv2_init(struct einjv2_extension_struct *e)
+{
+ if (e->revision != 1) {
+ pr_info("Unknown v2 extension revision %u\n", e->revision);
+ return 0;
+ }
+ if (e->length < sizeof(*e) || e->length > PAGE_SIZE) {
+ pr_info(FW_BUG "Bad1 v2 extension length %u\n", e->length);
+ return 0;
+ }
+ if ((e->length - sizeof(*e)) % sizeof(e->component_arr[0])) {
+ pr_info(FW_BUG "Bad2 v2 extension length %u\n", e->length);
+ return 0;
+ }
+
+ return (e->length - sizeof(*e)) / sizeof(e->component_arr[0]);
+}
+
static void __iomem *einj_get_parameter_address(void)
{
int i;
@@ -310,28 +329,21 @@ static void __iomem *einj_get_parameter_address(void)
v5param_size = sizeof(v5param);
p = acpi_os_map_iomem(pa_v5, sizeof(*p));
if (p) {
- int offset, len;
-
memcpy_fromio(&v5param, p, v5param_size);
acpi5 = 1;
check_vendor_extension(pa_v5, &v5param);
- if (is_v2 && available_error_type & ACPI65_EINJV2_SUPP) {
- len = v5param.einjv2_struct.length;
- offset = offsetof(struct einjv2_extension_struct, component_arr);
- max_nr_components = (len - offset) /
- sizeof(v5param.einjv2_struct.component_arr[0]);
- /*
- * The first call to acpi_os_map_iomem above does not include the
- * component array, instead it is used to read and calculate maximum
- * number of components supported by the system. Below, the mapping
- * is expanded to include the component array.
- */
+ if (available_error_type & ACPI65_EINJV2_SUPP) {
+ struct einjv2_extension_struct *e;
+
+ e = &v5param.einjv2_struct;
+ max_nr_components = einjv2_init(e);
+
+ /* remap including einjv2_extension_struct */
acpi_os_unmap_iomem(p, v5param_size);
- offset = offsetof(struct set_error_type_with_address, einjv2_struct);
- v5param_size = offset + struct_size(&v5param.einjv2_struct,
- component_arr, max_nr_components);
- p = acpi_os_map_iomem(pa_v5, v5param_size);
+ v66param_size = v5param_size - sizeof(*e) + e->length;
+ p = acpi_os_map_iomem(pa_v5, v66param_size);
}
+
return p;
}
}
@@ -527,6 +539,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
u64 param3, u64 param4)
{
struct apei_exec_context ctx;
+ u32 param_size = is_v2 ? v66param_size : v5param_size;
u64 val, trigger_paddr, timeout = FIRMWARE_TIMEOUT;
int i, rc;
@@ -539,11 +552,11 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
if (acpi5) {
struct set_error_type_with_address *v5param;
- v5param = kmalloc(v5param_size, GFP_KERNEL);
+ v5param = kmalloc(param_size, GFP_KERNEL);
if (!v5param)
return -ENOMEM;
- memcpy_fromio(v5param, einj_param, v5param_size);
+ memcpy_fromio(v5param, einj_param, param_size);
v5param->type = type;
if (type & ACPI5_VENDOR_BIT) {
switch (vendor_flags) {
@@ -601,7 +614,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
break;
}
}
- memcpy_toio(einj_param, v5param, v5param_size);
+ memcpy_toio(einj_param, v5param, param_size);
kfree(v5param);
} else {
rc = apei_exec_run(&ctx, ACPI_EINJ_SET_ERROR_TYPE);
@@ -1132,9 +1145,14 @@ static void einj_remove(struct faux_device *fdev)
struct apei_exec_context ctx;
if (einj_param) {
- acpi_size size = (acpi5) ?
- v5param_size :
- sizeof(struct einj_parameter);
+ acpi_size size;
+
+ if (v66param_size)
+ size = v66param_size;
+ else if (acpi5)
+ size = v5param_size;
+ else
+ size = sizeof(struct einj_parameter);
acpi_os_unmap_iomem(einj_param, size);
if (vendor_errors.size)
diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index fd995a1d3d24..8cc8af8fd408 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -430,10 +430,10 @@ static int __init gtdt_platform_timer_init(void)
continue;
pdev = platform_device_register_data(NULL, "gtdt-arm-mmio-timer",
- gwdt_count, &atm,
+ mmio_timer_count, &atm,
sizeof(atm));
if (IS_ERR(pdev)) {
- pr_err("Can't register timer %d\n", gwdt_count);
+ pr_err("Can't register timer %d\n", mmio_timer_count);
continue;
}
diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 6c684e54fe01..3bdeeee3414e 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -460,7 +460,7 @@ bool acpi_cpc_valid(void)
if (acpi_disabled)
return false;
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
cpc_ptr = per_cpu(cpc_desc_ptr, cpu);
if (!cpc_ptr)
return false;
@@ -476,7 +476,7 @@ bool cppc_allow_fast_switch(void)
struct cpc_desc *cpc_ptr;
int cpu;
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
cpc_ptr = per_cpu(cpc_desc_ptr, cpu);
desired_reg = &cpc_ptr->cpc_regs[DESIRED_PERF];
if (!CPC_IN_SYSTEM_MEMORY(desired_reg) &&
@@ -1435,7 +1435,7 @@ bool cppc_perf_ctrs_in_pcc(void)
{
int cpu;
- for_each_present_cpu(cpu) {
+ for_each_online_cpu(cpu) {
struct cpc_register_resource *ref_perf_reg;
struct cpc_desc *cpc_desc;
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 5a36d57289b4..11e4483685c9 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -874,11 +874,33 @@ static void hmat_register_target_devices(struct memory_target *target)
}
}
-static void hmat_register_target(struct memory_target *target)
+static void hmat_hotplug_target(struct memory_target *target)
{
int nid = pxm_to_node(target->memory_pxm);
/*
+ * Skip offline nodes. This can happen when memory marked EFI_MEMORY_SP,
+ * "specific purpose", is applied to all the memory in a proximity
+ * domain leading to the node being marked offline / unplugged, or if
+ * memory-only "hotplug" node is offline.
+ */
+ if (nid == NUMA_NO_NODE || !node_online(nid))
+ return;
+
+ guard(mutex)(&target_lock);
+ if (target->registered)
+ return;
+
+ hmat_register_target_initiators(target);
+ hmat_register_target_cache(target);
+ hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL);
+ hmat_register_target_perf(target, ACCESS_COORDINATE_CPU);
+ target->registered = true;
+}
+
+static void hmat_register_target(struct memory_target *target)
+{
+ /*
* Devices may belong to either an offline or online
* node, so unconditionally add them.
*/
@@ -895,25 +917,7 @@ static void hmat_register_target(struct memory_target *target)
}
mutex_unlock(&target_lock);
- /*
- * Skip offline nodes. This can happen when memory
- * marked EFI_MEMORY_SP, "specific purpose", is applied
- * to all the memory in a proximity domain leading to
- * the node being marked offline / unplugged, or if
- * memory-only "hotplug" node is offline.
- */
- if (nid == NUMA_NO_NODE || !node_online(nid))
- return;
-
- mutex_lock(&target_lock);
- if (!target->registered) {
- hmat_register_target_initiators(target);
- hmat_register_target_cache(target);
- hmat_register_target_perf(target, ACCESS_COORDINATE_LOCAL);
- hmat_register_target_perf(target, ACCESS_COORDINATE_CPU);
- target->registered = true;
- }
- mutex_unlock(&target_lock);
+ hmat_hotplug_target(target);
}
static void hmat_register_targets(void)
@@ -939,7 +943,7 @@ static int hmat_callback(struct notifier_block *self,
if (!target)
return NOTIFY_OK;
- hmat_register_target(target);
+ hmat_hotplug_target(target);
return NOTIFY_OK;
}
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 53816dfab645..aa87ee1583a4 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -237,7 +237,7 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header)
struct acpi_srat_generic_affinity *p =
(struct acpi_srat_generic_affinity *)header;
- if (p->device_handle_type == 0) {
+ if (p->device_handle_type == 1) {
/*
* For pci devices this may be the only place they
* are assigned a proximity domain
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 5d824435b26b..65e779be64ff 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -166,7 +166,8 @@ static int __acpi_processor_start(struct acpi_device *device)
if (result && !IS_ENABLED(CONFIG_ACPI_CPU_FREQ_PSS))
dev_dbg(&device->dev, "CPPC data invalid or not present\n");
- acpi_processor_power_init(pr);
+ if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver)
+ acpi_processor_power_init(pr);
acpi_pss_perf_init(pr);
@@ -262,8 +263,6 @@ static int __init acpi_processor_driver_init(void)
if (result < 0)
return result;
- acpi_processor_register_idle_driver();
-
result = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"acpi/cpu-drv:online",
acpi_soft_cpu_online, NULL);
@@ -302,7 +301,6 @@ static void __exit acpi_processor_driver_exit(void)
cpuhp_remove_state_nocalls(hp_online);
cpuhp_remove_state_nocalls(CPUHP_ACPI_CPUDRV_DEAD);
- acpi_processor_unregister_idle_driver();
driver_unregister(&acpi_processor_driver);
}
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 22b051b94a86..4166090db642 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -51,7 +51,7 @@ module_param(latency_factor, uint, 0644);
static DEFINE_PER_CPU(struct cpuidle_device *, acpi_cpuidle_device);
-static struct cpuidle_driver acpi_idle_driver = {
+struct cpuidle_driver acpi_idle_driver = {
.name = "acpi_idle",
.owner = THIS_MODULE,
};
@@ -1357,102 +1357,79 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr)
return 0;
}
-void acpi_processor_register_idle_driver(void)
-{
- struct acpi_processor *pr;
- int ret = -ENODEV;
- int cpu;
-
- /*
- * Acpi idle driver is used by all possible CPUs.
- * Install the idle handler by the processor power info of one in them.
- * Note that we use previously set idle handler will be used on
- * platforms that only support C1.
- */
- for_each_cpu(cpu, (struct cpumask *)cpu_possible_mask) {
- pr = per_cpu(processors, cpu);
- if (!pr)
- continue;
-
- ret = acpi_processor_get_power_info(pr);
- if (!ret) {
- pr->flags.power_setup_done = 1;
- acpi_processor_setup_cpuidle_states(pr);
- break;
- }
- }
-
- if (ret) {
- pr_debug("No ACPI power information from any CPUs.\n");
- return;
- }
+static int acpi_processor_registered;
- ret = cpuidle_register_driver(&acpi_idle_driver);
- if (ret) {
- pr_debug("register %s failed.\n", acpi_idle_driver.name);
- return;
- }
- pr_debug("%s registered with cpuidle.\n", acpi_idle_driver.name);
-}
-
-void acpi_processor_unregister_idle_driver(void)
-{
- cpuidle_unregister_driver(&acpi_idle_driver);
-}
-
-void acpi_processor_power_init(struct acpi_processor *pr)
+int acpi_processor_power_init(struct acpi_processor *pr)
{
+ int retval;
struct cpuidle_device *dev;
- /*
- * The code below only works if the current cpuidle driver is the ACPI
- * idle driver.
- */
- if (cpuidle_get_driver() != &acpi_idle_driver)
- return;
-
if (disabled_by_idle_boot_param())
- return;
+ return 0;
acpi_processor_cstate_first_run_checks();
if (!acpi_processor_get_power_info(pr))
pr->flags.power_setup_done = 1;
- if (!pr->flags.power)
- return;
-
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev)
- return;
+ /*
+ * Install the idle handler if processor power management is supported.
+ * Note that the previously set idle handler will be used on
+ * platforms that only support C1.
+ */
+ if (pr->flags.power) {
+ /* Register acpi_idle_driver if not already registered */
+ if (!acpi_processor_registered) {
+ acpi_processor_setup_cpuidle_states(pr);
+ retval = cpuidle_register_driver(&acpi_idle_driver);
+ if (retval)
+ return retval;
+ pr_debug("%s registered with cpuidle\n",
+ acpi_idle_driver.name);
+ }
- per_cpu(acpi_cpuidle_device, pr->id) = dev;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ per_cpu(acpi_cpuidle_device, pr->id) = dev;
- acpi_processor_setup_cpuidle_dev(pr, dev);
+ acpi_processor_setup_cpuidle_dev(pr, dev);
- /*
- * Register a cpuidle device for this CPU. The cpuidle driver using
- * this device is expected to be registered.
- */
- if (cpuidle_register_device(dev)) {
- per_cpu(acpi_cpuidle_device, pr->id) = NULL;
- kfree(dev);
+ /* Register per-cpu cpuidle_device. Cpuidle driver
+ * must already be registered before registering device
+ */
+ retval = cpuidle_register_device(dev);
+ if (retval) {
+ if (acpi_processor_registered == 0)
+ cpuidle_unregister_driver(&acpi_idle_driver);
+
+ per_cpu(acpi_cpuidle_device, pr->id) = NULL;
+ kfree(dev);
+ return retval;
+ }
+ acpi_processor_registered++;
}
+ return 0;
}
-void acpi_processor_power_exit(struct acpi_processor *pr)
+int acpi_processor_power_exit(struct acpi_processor *pr)
{
struct cpuidle_device *dev = per_cpu(acpi_cpuidle_device, pr->id);
if (disabled_by_idle_boot_param())
- return;
+ return 0;
if (pr->flags.power) {
cpuidle_unregister_device(dev);
+ acpi_processor_registered--;
+ if (acpi_processor_registered == 0)
+ cpuidle_unregister_driver(&acpi_idle_driver);
+
kfree(dev);
}
pr->flags.power_setup_done = 0;
+ return 0;
}
MODULE_IMPORT_NS("ACPI_PROCESSOR_IDLE");
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 2a210719c4ce..f48fb63d7e85 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3006,6 +3006,16 @@ int ata_dev_configure(struct ata_device *dev)
}
dev->n_sectors = ata_id_n_sectors(id);
+ if (ata_id_is_locked(id)) {
+ /*
+ * If Security locked, set capacity to zero to prevent
+ * any I/O, e.g. partition scanning, as any I/O to a
+ * locked drive will result in user visible errors.
+ */
+ ata_dev_info(dev,
+ "Security locked, setting capacity to zero\n");
+ dev->n_sectors = 0;
+ }
/* get current R/W Multiple count setting */
if ((dev->id[47] >> 8) == 0x80 && (dev->id[59] & 0x100)) {
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b43a3196e2be..434774e71fe6 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -992,6 +992,13 @@ static void ata_gen_ata_sense(struct ata_queued_cmd *qc)
return;
}
+ if (ata_id_is_locked(dev->id)) {
+ /* Security locked */
+ /* LOGICAL UNIT ACCESS NOT AUTHORIZED */
+ ata_scsi_set_sense(dev, cmd, DATA_PROTECT, 0x74, 0x71);
+ return;
+ }
+
if (!(qc->flags & ATA_QCFLAG_RTF_FILLED)) {
ata_dev_dbg(dev,
"Missing result TF: reporting aborted command\n");
@@ -4894,8 +4901,10 @@ void ata_scsi_dev_rescan(struct work_struct *work)
spin_unlock_irqrestore(ap->lock, flags);
if (do_resume) {
ret = scsi_resume_device(sdev);
- if (ret == -EWOULDBLOCK)
+ if (ret == -EWOULDBLOCK) {
+ scsi_device_put(sdev);
goto unlock_scan;
+ }
dev->flags &= ~ATA_DFLAG_RESUMING;
}
ret = scsi_rescan_device(sdev);
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 4fea1149e003..f62e38571440 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -1374,7 +1374,9 @@ fore200e_open(struct atm_vcc *vcc)
vcc->dev_data = NULL;
+ mutex_lock(&fore200e->rate_mtx);
fore200e->available_cell_rate += vcc->qos.txtp.max_pcr;
+ mutex_unlock(&fore200e->rate_mtx);
kfree(fore200e_vcc);
return -EINVAL;
diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 9d4e46ad8352..2f576ecf1832 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -180,7 +180,7 @@ static int dev_mkdir(const char *name, umode_t mode)
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode);
+ dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, NULL);
if (!IS_ERR(dentry))
/* mark as kernel-created inode */
d_inode(dentry)->i_private = &thread;
@@ -231,7 +231,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid,
return PTR_ERR(dentry);
err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode,
- dev->devt);
+ dev->devt, NULL);
if (!err) {
struct iattr newattrs;
@@ -261,7 +261,7 @@ static int dev_rmdir(const char *name)
return PTR_ERR(dentry);
if (d_inode(dentry)->i_private == &thread)
err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry),
- dentry);
+ dentry, NULL);
else
err = -EPERM;
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 6942c62fa59d..bee3050a20d9 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -829,8 +829,6 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
size_t offset, u32 opt_flags)
{
struct firmware *fw = NULL;
- struct cred *kern_cred = NULL;
- const struct cred *old_cred;
bool nondirect = false;
int ret;
@@ -871,45 +869,38 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
* called by a driver when serving an unrelated request from userland, we use
* the kernel credentials to read the file.
*/
- kern_cred = prepare_kernel_cred(&init_task);
- if (!kern_cred) {
- ret = -ENOMEM;
- goto out;
- }
- old_cred = override_creds(kern_cred);
+ scoped_with_kernel_creds() {
+ ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL);
- ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL);
-
- /* Only full reads can support decompression, platform, and sysfs. */
- if (!(opt_flags & FW_OPT_PARTIAL))
- nondirect = true;
+ /* Only full reads can support decompression, platform, and sysfs. */
+ if (!(opt_flags & FW_OPT_PARTIAL))
+ nondirect = true;
#ifdef CONFIG_FW_LOADER_COMPRESS_ZSTD
- if (ret == -ENOENT && nondirect)
- ret = fw_get_filesystem_firmware(device, fw->priv, ".zst",
- fw_decompress_zstd);
+ if (ret == -ENOENT && nondirect)
+ ret = fw_get_filesystem_firmware(device, fw->priv, ".zst",
+ fw_decompress_zstd);
#endif
#ifdef CONFIG_FW_LOADER_COMPRESS_XZ
- if (ret == -ENOENT && nondirect)
- ret = fw_get_filesystem_firmware(device, fw->priv, ".xz",
- fw_decompress_xz);
+ if (ret == -ENOENT && nondirect)
+ ret = fw_get_filesystem_firmware(device, fw->priv, ".xz",
+ fw_decompress_xz);
#endif
- if (ret == -ENOENT && nondirect)
- ret = firmware_fallback_platform(fw->priv);
+ if (ret == -ENOENT && nondirect)
+ ret = firmware_fallback_platform(fw->priv);
- if (ret) {
- if (!(opt_flags & FW_OPT_NO_WARN))
- dev_warn(device,
- "Direct firmware load for %s failed with error %d\n",
- name, ret);
- if (nondirect)
- ret = firmware_fallback_sysfs(fw, name, device,
- opt_flags, ret);
- } else
- ret = assign_fw(fw, device);
-
- revert_creds(old_cred);
- put_cred(kern_cred);
+ if (ret) {
+ if (!(opt_flags & FW_OPT_NO_WARN))
+ dev_warn(device,
+ "Direct firmware load for %s failed with error %d\n",
+ name, ret);
+ if (nondirect)
+ ret = firmware_fallback_sysfs(fw, name, device,
+ opt_flags, ret);
+ } else {
+ ret = assign_fw(fw, device);
+ }
+ }
out:
if (ret < 0) {
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e83503bdc1fd..1de1cd72b616 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -888,12 +888,15 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy
TRACE_DEVICE(dev);
TRACE_RESUME(0);
- if (dev->power.syscore || dev->power.direct_complete)
+ if (dev->power.direct_complete)
goto Out;
if (!dev->power.is_late_suspended)
goto Out;
+ if (dev->power.syscore)
+ goto Skip;
+
if (!dpm_wait_for_superior(dev, async))
goto Out;
@@ -926,11 +929,11 @@ Run:
Skip:
dev->power.is_late_suspended = false;
+ pm_runtime_enable(dev);
Out:
TRACE_RESUME(error);
- pm_runtime_enable(dev);
complete_all(&dev->power.completion);
if (error) {
@@ -1615,12 +1618,6 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy
TRACE_DEVICE(dev);
TRACE_SUSPEND(0);
- /*
- * Disable runtime PM for the device without checking if there is a
- * pending resume request for it.
- */
- __pm_runtime_disable(dev, false);
-
dpm_wait_for_subordinate(dev, async);
if (READ_ONCE(async_error))
@@ -1631,9 +1628,18 @@ static void device_suspend_late(struct device *dev, pm_message_t state, bool asy
goto Complete;
}
- if (dev->power.syscore || dev->power.direct_complete)
+ if (dev->power.direct_complete)
goto Complete;
+ /*
+ * Disable runtime PM for the device without checking if there is a
+ * pending resume request for it.
+ */
+ __pm_runtime_disable(dev, false);
+
+ if (dev->power.syscore)
+ goto Skip;
+
if (dev->pm_domain) {
info = "late power domain ";
callback = pm_late_early_op(&dev->pm_domain->ops, state);
@@ -1664,6 +1670,7 @@ Run:
WRITE_ONCE(async_error, error);
dpm_save_failed_dev(dev_name(dev));
pm_dev_err(dev, state, async ? " async late" : " late", error);
+ pm_runtime_enable(dev);
goto Complete;
}
dpm_propagate_wakeup_to_parent(dev);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a853c65ac65d..3263040fcf2d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -52,7 +52,6 @@
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
static struct workqueue_struct *nbd_del_wq;
-static struct cred *nbd_cred;
static int nbd_total_devices = 0;
struct nbd_sock {
@@ -555,7 +554,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
int result;
struct msghdr msg = {} ;
unsigned int noreclaim_flag;
- const struct cred *old_cred;
if (unlikely(!sock)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
@@ -564,33 +562,32 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
return -EINVAL;
}
- old_cred = override_creds(nbd_cred);
-
msg.msg_iter = *iter;
noreclaim_flag = memalloc_noreclaim_save();
- do {
- sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
- sock->sk->sk_use_task_frag = false;
- msg.msg_flags = msg_flags | MSG_NOSIGNAL;
-
- if (send)
- result = sock_sendmsg(sock, &msg);
- else
- result = sock_recvmsg(sock, &msg, msg.msg_flags);
-
- if (result <= 0) {
- if (result == 0)
- result = -EPIPE; /* short read */
- break;
- }
- if (sent)
- *sent += result;
- } while (msg_data_left(&msg));
- memalloc_noreclaim_restore(noreclaim_flag);
+ scoped_with_kernel_creds() {
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ sock->sk->sk_use_task_frag = false;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
- revert_creds(old_cred);
+ if (send)
+ result = sock_sendmsg(sock, &msg);
+ else
+ result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ if (sent)
+ *sent += result;
+ } while (msg_data_left(&msg));
+ }
+
+ memalloc_noreclaim_restore(noreclaim_flag);
return result;
}
@@ -2683,15 +2680,7 @@ static int __init nbd_init(void)
return -ENOMEM;
}
- nbd_cred = prepare_kernel_cred(&init_task);
- if (!nbd_cred) {
- destroy_workqueue(nbd_del_wq);
- unregister_blkdev(NBD_MAJOR, "nbd");
- return -ENOMEM;
- }
-
if (genl_register_family(&nbd_genl_family)) {
- put_cred(nbd_cred);
destroy_workqueue(nbd_del_wq);
unregister_blkdev(NBD_MAJOR, "nbd");
return -EINVAL;
@@ -2746,7 +2735,6 @@ static void __exit nbd_cleanup(void)
/* Also wait for nbd_dev_remove_work() to complete */
destroy_workqueue(nbd_del_wq);
- put_cred(nbd_cred);
idr_destroy(&nbd_index_idr);
unregister_blkdev(NBD_MAJOR, "nbd");
}
diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c
index 1d4a7887abcc..52794db2739b 100644
--- a/drivers/bluetooth/btrtl.c
+++ b/drivers/bluetooth/btrtl.c
@@ -50,7 +50,7 @@
#define RTL_CHIP_SUBVER (&(struct rtl_vendor_cmd) {{0x10, 0x38, 0x04, 0x28, 0x80}})
#define RTL_CHIP_REV (&(struct rtl_vendor_cmd) {{0x10, 0x3A, 0x04, 0x28, 0x80}})
-#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0x0D, 0x00, 0xb0}})
+#define RTL_SEC_PROJ (&(struct rtl_vendor_cmd) {{0x10, 0xA4, 0xAD, 0x00, 0xb0}})
#define RTL_PATCH_SNIPPETS 0x01
#define RTL_PATCH_DUMMY_HEADER 0x02
@@ -534,7 +534,6 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev,
{
struct rtl_epatch_header_v2 *hdr;
int rc;
- u8 reg_val[2];
u8 key_id;
u32 num_sections;
struct rtl_section *section;
@@ -549,14 +548,7 @@ static int rtlbt_parse_firmware_v2(struct hci_dev *hdev,
.len = btrtl_dev->fw_len - 7, /* Cut the tail */
};
- rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val);
- if (rc < 0)
- return -EIO;
- key_id = reg_val[0];
-
- rtl_dev_dbg(hdev, "%s: key id %u", __func__, key_id);
-
- btrtl_dev->key_id = key_id;
+ key_id = btrtl_dev->key_id;
hdr = rtl_iov_pull_data(&iov, sizeof(*hdr));
if (!hdr)
@@ -1070,6 +1062,8 @@ struct btrtl_device_info *btrtl_initialize(struct hci_dev *hdev,
u16 hci_rev, lmp_subver;
u8 hci_ver, lmp_ver, chip_type = 0;
int ret;
+ int rc;
+ u8 key_id;
u8 reg_val[2];
btrtl_dev = kzalloc(sizeof(*btrtl_dev), GFP_KERNEL);
@@ -1180,6 +1174,14 @@ next:
goto err_free;
}
+ rc = btrtl_vendor_read_reg16(hdev, RTL_SEC_PROJ, reg_val);
+ if (rc < 0)
+ goto err_free;
+
+ key_id = reg_val[0];
+ btrtl_dev->key_id = key_id;
+ rtl_dev_info(hdev, "%s: key id %u", __func__, key_id);
+
btrtl_dev->fw_len = -EIO;
if (lmp_subver == RTL_ROM_LMP_8852A && hci_rev == 0x000c) {
snprintf(fw_name, sizeof(fw_name), "%s_v2.bin",
@@ -1202,7 +1204,7 @@ next:
goto err_free;
}
- if (btrtl_dev->ic_info->cfg_name) {
+ if (btrtl_dev->ic_info->cfg_name && !btrtl_dev->key_id) {
if (postfix) {
snprintf(cfg_name, sizeof(cfg_name), "%s-%s.bin",
btrtl_dev->ic_info->cfg_name, postfix);
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 5e9ebf0c5312..fa683bb7f0b4 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2711,9 +2711,21 @@ static int btusb_recv_event_realtek(struct hci_dev *hdev, struct sk_buff *skb)
static void btusb_mtk_claim_iso_intf(struct btusb_data *data)
{
- struct btmtk_data *btmtk_data = hci_get_priv(data->hdev);
+ struct btmtk_data *btmtk_data;
int err;
+ if (!data->hdev)
+ return;
+
+ btmtk_data = hci_get_priv(data->hdev);
+ if (!btmtk_data)
+ return;
+
+ if (!btmtk_data->isopkt_intf) {
+ bt_dev_err(data->hdev, "Can't claim NULL iso interface");
+ return;
+ }
+
/*
* The function usb_driver_claim_interface() is documented to need
* locks held if it's not called from a probe routine. The code here
@@ -2735,17 +2747,30 @@ static void btusb_mtk_claim_iso_intf(struct btusb_data *data)
static void btusb_mtk_release_iso_intf(struct hci_dev *hdev)
{
- struct btmtk_data *btmtk_data = hci_get_priv(hdev);
+ struct btmtk_data *btmtk_data;
+
+ if (!hdev)
+ return;
+
+ btmtk_data = hci_get_priv(hdev);
+ if (!btmtk_data)
+ return;
if (test_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags)) {
usb_kill_anchored_urbs(&btmtk_data->isopkt_anchor);
clear_bit(BTMTK_ISOPKT_RUNNING, &btmtk_data->flags);
- dev_kfree_skb_irq(btmtk_data->isopkt_skb);
- btmtk_data->isopkt_skb = NULL;
- usb_set_intfdata(btmtk_data->isopkt_intf, NULL);
- usb_driver_release_interface(&btusb_driver,
- btmtk_data->isopkt_intf);
+ if (btmtk_data->isopkt_skb) {
+ dev_kfree_skb_irq(btmtk_data->isopkt_skb);
+ btmtk_data->isopkt_skb = NULL;
+ }
+
+ if (btmtk_data->isopkt_intf) {
+ usb_set_intfdata(btmtk_data->isopkt_intf, NULL);
+ usb_driver_release_interface(&btusb_driver,
+ btmtk_data->isopkt_intf);
+ btmtk_data->isopkt_intf = NULL;
+ }
}
clear_bit(BTMTK_ISOPKT_OVER_INTR, &btmtk_data->flags);
@@ -4361,6 +4386,11 @@ static void btusb_disconnect(struct usb_interface *intf)
hci_unregister_dev(hdev);
+ if (data->oob_wake_irq)
+ device_init_wakeup(&data->udev->dev, false);
+ if (data->reset_gpio)
+ gpiod_put(data->reset_gpio);
+
if (intf == data->intf) {
if (data->isoc)
usb_driver_release_interface(&btusb_driver, data->isoc);
@@ -4371,17 +4401,11 @@ static void btusb_disconnect(struct usb_interface *intf)
usb_driver_release_interface(&btusb_driver, data->diag);
usb_driver_release_interface(&btusb_driver, data->intf);
} else if (intf == data->diag) {
- usb_driver_release_interface(&btusb_driver, data->intf);
if (data->isoc)
usb_driver_release_interface(&btusb_driver, data->isoc);
+ usb_driver_release_interface(&btusb_driver, data->intf);
}
- if (data->oob_wake_irq)
- device_init_wakeup(&data->udev->dev, false);
-
- if (data->reset_gpio)
- gpiod_put(data->reset_gpio);
-
hci_free_dev(hdev);
}
diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c
index 70ce0ca0cb7d..0339c4af0fe5 100644
--- a/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c
+++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523-r.c
@@ -121,11 +121,11 @@ static SUNXI_CCU_GATE_HW(bus_r_ir_rx_clk, "bus-r-ir-rx",
&r_apb0_clk.common.hw, 0x1cc, BIT(0), 0);
static SUNXI_CCU_GATE_HW(bus_r_dma_clk, "bus-r-dma",
- &r_apb0_clk.common.hw, 0x1dc, BIT(0), 0);
+ &r_apb0_clk.common.hw, 0x1dc, BIT(0), CLK_IS_CRITICAL);
static SUNXI_CCU_GATE_HW(bus_r_rtc_clk, "bus-r-rtc",
&r_apb0_clk.common.hw, 0x20c, BIT(0), 0);
static SUNXI_CCU_GATE_HW(bus_r_cpucfg_clk, "bus-r-cpucfg",
- &r_apb0_clk.common.hw, 0x22c, BIT(0), 0);
+ &r_apb0_clk.common.hw, 0x22c, BIT(0), CLK_IS_CRITICAL);
static struct ccu_common *sun55i_a523_r_ccu_clks[] = {
&r_ahb_clk.common,
diff --git a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c
index acb532f8361b..20dad06b37ca 100644
--- a/drivers/clk/sunxi-ng/ccu-sun55i-a523.c
+++ b/drivers/clk/sunxi-ng/ccu-sun55i-a523.c
@@ -300,7 +300,7 @@ static struct ccu_nm pll_audio0_4x_clk = {
.m = _SUNXI_CCU_DIV(16, 6),
.sdm = _SUNXI_CCU_SDM(pll_audio0_sdm_table, BIT(24),
0x178, BIT(31)),
- .min_rate = 180000000U,
+ .min_rate = 90000000U,
.max_rate = 3000000000U,
.common = {
.reg = 0x078,
diff --git a/drivers/counter/microchip-tcb-capture.c b/drivers/counter/microchip-tcb-capture.c
index 1a299d1f350b..19d457ae4c3b 100644
--- a/drivers/counter/microchip-tcb-capture.c
+++ b/drivers/counter/microchip-tcb-capture.c
@@ -451,7 +451,7 @@ static void mchp_tc_irq_remove(void *ptr)
static int mchp_tc_irq_enable(struct counter_device *const counter, int irq)
{
struct mchp_tc_data *const priv = counter_priv(counter);
- int ret = devm_request_irq(counter->parent, irq, mchp_tc_isr, 0,
+ int ret = devm_request_irq(counter->parent, irq, mchp_tc_isr, IRQF_SHARED,
dev_name(counter->parent), counter);
if (ret < 0)
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 38897bb14a2c..492a10f1bdbf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -603,9 +603,6 @@ static bool turbo_is_disabled(void)
{
u64 misc_en;
- if (!cpu_feature_enabled(X86_FEATURE_IDA))
- return true;
-
rdmsrq(MSR_IA32_MISC_ENABLE, misc_en);
return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
@@ -2106,7 +2103,8 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate)
u32 vid;
val = (u64)pstate << 8;
- if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled))
+ if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) &&
+ cpu_feature_enabled(X86_FEATURE_IDA))
val |= (u64)1 << 32;
vid_fp = cpudata->vid.min + mul_fp(
@@ -2271,7 +2269,8 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate)
u64 val;
val = (u64)pstate << 8;
- if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled))
+ if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) &&
+ cpu_feature_enabled(X86_FEATURE_IDA))
val |= (u64)1 << 32;
return val;
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 0d13d47c164b..b28a6f50daaa 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -259,27 +259,20 @@ static int sev_cmd_buffer_len(int cmd)
static struct file *open_file_as_root(const char *filename, int flags, umode_t mode)
{
- struct file *fp;
- struct path root;
- struct cred *cred;
- const struct cred *old_cred;
+ struct path root __free(path_put) = {};
task_lock(&init_task);
get_fs_root(init_task.fs, &root);
task_unlock(&init_task);
- cred = prepare_creds();
+ CLASS(prepare_creds, cred)();
if (!cred)
return ERR_PTR(-ENOMEM);
- cred->fsuid = GLOBAL_ROOT_UID;
- old_cred = override_creds(cred);
- fp = file_open_root(&root, filename, flags, mode);
- path_put(&root);
-
- put_cred(revert_creds(old_cred));
+ cred->fsuid = GLOBAL_ROOT_UID;
- return fp;
+ scoped_with_creds(cred)
+ return file_open_root(&root, filename, flags, mode);
}
static int sev_read_init_ex_file(void)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index a5b96adf2d1e..3b391a146635 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -3871,10 +3871,12 @@ static ssize_t qm_get_qos_value(struct hisi_qm *qm, const char *buf,
pdev = container_of(dev, struct pci_dev, dev);
if (pci_physfn(pdev) != qm->pdev) {
pci_err(qm->pdev, "the pdev input does not match the pf!\n");
+ put_device(dev);
return -EINVAL;
}
*fun_index = pdev->devfn;
+ put_device(dev);
return 0;
}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index b06fee1978ba..41b64d871c5a 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3702,6 +3702,7 @@ static int cxl_region_debugfs_poison_inject(void *data, u64 offset)
if (validate_region_offset(cxlr, offset))
return -EINVAL;
+ offset -= cxlr->params.cache_size;
rc = region_offset_to_dpa_result(cxlr, offset, &result);
if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
dev_dbg(&cxlr->dev,
@@ -3734,6 +3735,7 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
if (validate_region_offset(cxlr, offset))
return -EINVAL;
+ offset -= cxlr->params.cache_size;
rc = region_offset_to_dpa_result(cxlr, offset, &result);
if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
dev_dbg(&cxlr->dev,
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index d7714d8afb0f..c00b9dff4a06 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -433,7 +433,7 @@ static struct dax_device *dax_dev_get(dev_t devt)
return NULL;
dax_dev = to_dax_dev(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
set_bit(DAXDEV_ALIVE, &dax_dev->flags);
inode->i_cdev = &dax_dev->cdev;
inode->i_mode = S_IFCHR;
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 2bcf9ceca997..edaa9e4ee4ae 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -768,18 +768,10 @@ EXPORT_SYMBOL_NS_GPL(dma_buf_export, "DMA_BUF");
*/
int dma_buf_fd(struct dma_buf *dmabuf, int flags)
{
- int fd;
-
if (!dmabuf || !dmabuf->file)
return -EINVAL;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- return fd;
-
- fd_install(fd, dmabuf->file);
-
- return fd;
+ return FD_ADD(flags, dmabuf->file);
}
EXPORT_SYMBOL_NS_GPL(dma_buf_fd, "DMA_BUF");
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 103b2c2eba2a..0c5b94e64ea1 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -1184,10 +1184,22 @@ altr_check_ocram_deps_init(struct altr_edac_device_dev *device)
if (ret)
return ret;
- /* Verify OCRAM has been initialized */
+ /*
+ * Verify that OCRAM has been initialized.
+ * During a warm reset, OCRAM contents are retained, but the control
+ * and status registers are reset to their default values. Therefore,
+ * ECC must be explicitly re-enabled in the control register.
+ * Error condition: if INITCOMPLETEA is clear and ECC_EN is already set.
+ */
if (!ecc_test_bits(ALTR_A10_ECC_INITCOMPLETEA,
- (base + ALTR_A10_ECC_INITSTAT_OFST)))
- return -ENODEV;
+ (base + ALTR_A10_ECC_INITSTAT_OFST))) {
+ if (!ecc_test_bits(ALTR_A10_ECC_EN,
+ (base + ALTR_A10_ECC_CTRL_OFST)))
+ ecc_set_bits(ALTR_A10_ECC_EN,
+ (base + ALTR_A10_ECC_CTRL_OFST));
+ else
+ return -ENODEV;
+ }
/* Enable IRQ on Single Bit Error */
writel(ALTR_A10_ECC_SERRINTEN, (base + ALTR_A10_ECC_ERRINTENS_OFST));
@@ -1357,7 +1369,7 @@ static const struct edac_device_prv_data a10_enetecc_data = {
.ue_set_mask = ALTR_A10_ECC_TDERRA,
.set_err_ofst = ALTR_A10_ECC_INTTEST_OFST,
.ecc_irq_handler = altr_edac_a10_ecc_irq,
- .inject_fops = &altr_edac_a10_device_inject2_fops,
+ .inject_fops = &altr_edac_a10_device_inject_fops,
};
#endif /* CONFIG_EDAC_ALTERA_ETHERNET */
@@ -1447,7 +1459,7 @@ static const struct edac_device_prv_data a10_usbecc_data = {
.ue_set_mask = ALTR_A10_ECC_TDERRA,
.set_err_ofst = ALTR_A10_ECC_INTTEST_OFST,
.ecc_irq_handler = altr_edac_a10_ecc_irq,
- .inject_fops = &altr_edac_a10_device_inject2_fops,
+ .inject_fops = &altr_edac_a10_device_inject_fops,
};
#endif /* CONFIG_EDAC_ALTERA_USB */
diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c
index 1ded4c3f0213..1a1092793092 100644
--- a/drivers/edac/versalnet_edac.c
+++ b/drivers/edac/versalnet_edac.c
@@ -605,21 +605,23 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
length = result[MSG_ERR_LENGTH];
offset = result[MSG_ERR_OFFSET];
+ /*
+ * The data can come in two stretches. Construct the regs from two
+ * messages. The offset indicates the offset from which the data is to
+ * be taken.
+ */
+ for (i = 0 ; i < length; i++) {
+ k = offset + i;
+ j = ERROR_DATA + i;
+ mc_priv->regs[k] = result[j];
+ }
+
if (result[TOTAL_ERR_LENGTH] > length) {
if (!mc_priv->part_len)
mc_priv->part_len = length;
else
mc_priv->part_len += length;
- /*
- * The data can come in 2 stretches. Construct the regs from 2
- * messages the offset indicates the offset from which the data is to
- * be taken
- */
- for (i = 0 ; i < length; i++) {
- k = offset + i;
- j = ERROR_DATA + i;
- mc_priv->regs[k] = result[j];
- }
+
if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
return 0;
mc_priv->part_len = 0;
@@ -705,7 +707,7 @@ static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
/* Convert to bytes */
length = result[TOTAL_ERR_LENGTH] * 4;
log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
- sec_sev, (void *)&result[ERROR_DATA], length);
+ sec_sev, (void *)&mc_priv->regs, length);
return 0;
}
diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index e5e0174a0335..66e1106db5e7 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -577,6 +577,8 @@ void fw_card_initialize(struct fw_card *card,
INIT_LIST_HEAD(&card->transactions.list);
spin_lock_init(&card->transactions.lock);
+ spin_lock_init(&card->topology_map.lock);
+
card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000;
card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19;
card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT;
diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index 2f73bcd5696f..ed3ae8cdb0cd 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -441,12 +441,13 @@ static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_nod
const u32 *self_ids, int self_id_count)
{
__be32 *map = buffer;
+ u32 next_generation = be32_to_cpu(buffer[1]) + 1;
int node_count = (root_node_id & 0x3f) + 1;
memset(map, 0, buffer_size);
*map++ = cpu_to_be32((self_id_count + 2) << 16);
- *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1);
+ *map++ = cpu_to_be32(next_generation);
*map++ = cpu_to_be32((node_count << 16) | self_id_count);
while (self_id_count--)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 94b05e4451dd..7d15a85d579f 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -11,12 +11,12 @@ cflags-y := $(KBUILD_CFLAGS)
cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small
-cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 \
+cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -std=gnu11 -fms-extensions \
-fPIC -fno-strict-aliasing -mno-red-zone \
-mno-mmx -mno-sse -fshort-wchar \
-Wno-pointer-sign \
$(call cc-disable-warning, address-of-packed-member) \
- $(call cc-disable-warning, gnu) \
+ $(if $(CONFIG_CC_IS_CLANG),-Wno-gnu -Wno-microsoft-anon-tag) \
-fno-asynchronous-unwind-tables \
$(CLANG_FLAGS)
diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index e3f990d888d7..00f58e27f6de 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -134,6 +134,7 @@ struct stratix10_svc_data {
* @complete_status: state for completion
* @svc_fifo_lock: protect access to service message data queue
* @invoke_fn: function to issue secure monitor call or hypervisor call
+ * @svc: manages the list of client svc drivers
*
* This struct is used to create communication channels for service clients, to
* handle secure monitor or hypervisor call.
@@ -150,6 +151,7 @@ struct stratix10_svc_controller {
struct completion complete_status;
spinlock_t svc_fifo_lock;
svc_invoke_fn *invoke_fn;
+ struct stratix10_svc *svc;
};
/**
@@ -1206,6 +1208,7 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
ret = -ENOMEM;
goto err_free_kfifo;
}
+ controller->svc = svc;
svc->stratix10_svc_rsu = platform_device_alloc(STRATIX10_RSU, 0);
if (!svc->stratix10_svc_rsu) {
@@ -1237,8 +1240,6 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev)
if (ret)
goto err_unregister_fcs_dev;
- dev_set_drvdata(dev, svc);
-
pr_info("Intel Service Layer Driver Initialized\n");
return 0;
@@ -1256,8 +1257,8 @@ err_destroy_pool:
static void stratix10_svc_drv_remove(struct platform_device *pdev)
{
- struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev);
struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev);
+ struct stratix10_svc *svc = ctrl->svc;
of_platform_depopulate(ctrl->dev);
diff --git a/drivers/gpio/gpiolib-cdev.c b/drivers/gpio/gpiolib-cdev.c
index 175836467f21..084656564176 100644
--- a/drivers/gpio/gpiolib-cdev.c
+++ b/drivers/gpio/gpiolib-cdev.c
@@ -298,12 +298,13 @@ static const struct file_operations linehandle_fileops = {
#endif
};
+DEFINE_FREE(linehandle_free, struct linehandle_state *, if (!IS_ERR_OR_NULL(_T)) linehandle_free(_T))
+
static int linehandle_create(struct gpio_device *gdev, void __user *ip)
{
struct gpiohandle_request handlereq;
- struct linehandle_state *lh;
- struct file *file;
- int fd, i, ret;
+ struct linehandle_state *lh __free(linehandle_free) = NULL;
+ int i, ret;
u32 lflags;
if (copy_from_user(&handlereq, ip, sizeof(handlereq)))
@@ -327,10 +328,8 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip)
lh->label = kstrndup(handlereq.consumer_label,
sizeof(handlereq.consumer_label) - 1,
GFP_KERNEL);
- if (!lh->label) {
- ret = -ENOMEM;
- goto out_free_lh;
- }
+ if (!lh->label)
+ return -ENOMEM;
}
lh->num_descs = handlereq.lines;
@@ -340,20 +339,18 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip)
u32 offset = handlereq.lineoffsets[i];
struct gpio_desc *desc = gpio_device_get_desc(gdev, offset);
- if (IS_ERR(desc)) {
- ret = PTR_ERR(desc);
- goto out_free_lh;
- }
+ if (IS_ERR(desc))
+ return PTR_ERR(desc);
ret = gpiod_request_user(desc, lh->label);
if (ret)
- goto out_free_lh;
+ return ret;
lh->descs[i] = desc;
linehandle_flags_to_desc_flags(handlereq.flags, &desc->flags);
ret = gpiod_set_transitory(desc, false);
if (ret < 0)
- goto out_free_lh;
+ return ret;
/*
* Lines have to be requested explicitly for input
@@ -364,11 +361,11 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip)
ret = gpiod_direction_output_nonotify(desc, val);
if (ret)
- goto out_free_lh;
+ return ret;
} else if (lflags & GPIOHANDLE_REQUEST_INPUT) {
ret = gpiod_direction_input_nonotify(desc);
if (ret)
- goto out_free_lh;
+ return ret;
}
gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED);
@@ -377,44 +374,23 @@ static int linehandle_create(struct gpio_device *gdev, void __user *ip)
offset);
}
- fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
- if (fd < 0) {
- ret = fd;
- goto out_free_lh;
- }
-
- file = anon_inode_getfile("gpio-linehandle",
- &linehandle_fileops,
- lh,
- O_RDONLY | O_CLOEXEC);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto out_put_unused_fd;
- }
+ FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC,
+ anon_inode_getfile("gpio-linehandle", &linehandle_fileops,
+ lh, O_RDONLY | O_CLOEXEC));
+ if (fdf.err)
+ return fdf.err;
+ retain_and_null_ptr(lh);
- handlereq.fd = fd;
- if (copy_to_user(ip, &handlereq, sizeof(handlereq))) {
- /*
- * fput() will trigger the release() callback, so do not go onto
- * the regular error cleanup path here.
- */
- fput(file);
- put_unused_fd(fd);
+ handlereq.fd = fd_prepare_fd(fdf);
+ if (copy_to_user(ip, &handlereq, sizeof(handlereq)))
return -EFAULT;
- }
- fd_install(fd, file);
+ fd_publish(fdf);
dev_dbg(&gdev->dev, "registered chardev handle for %d lines\n",
lh->num_descs);
return 0;
-
-out_put_unused_fd:
- put_unused_fd(fd);
-out_free_lh:
- linehandle_free(lh);
- return ret;
}
#endif /* CONFIG_GPIO_CDEV_V1 */
@@ -2548,10 +2524,17 @@ static int lineinfo_changed_notify(struct notifier_block *nb,
container_of(nb, struct gpio_chardev_data, lineinfo_changed_nb);
struct lineinfo_changed_ctx *ctx;
struct gpio_desc *desc = data;
+ struct file *fp;
if (!test_bit(gpio_chip_hwgpio(desc), cdev->watched_lines))
return NOTIFY_DONE;
+ /* Keep the file descriptor alive for the duration of the notification. */
+ fp = get_file_active(&cdev->fp);
+ if (!fp)
+ /* Chardev file descriptor was or is being released. */
+ return NOTIFY_DONE;
+
/*
* If this is called from atomic context (for instance: with a spinlock
* taken by the atomic notifier chain), any sleeping calls must be done
@@ -2575,8 +2558,6 @@ static int lineinfo_changed_notify(struct notifier_block *nb,
/* Keep the GPIO device alive until we emit the event. */
ctx->gdev = gpio_device_get(desc->gdev);
ctx->cdev = cdev;
- /* Keep the file descriptor alive too. */
- get_file(ctx->cdev->fp);
INIT_WORK(&ctx->work, lineinfo_changed_func);
queue_work(ctx->gdev->line_state_wq, &ctx->work);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index f5d5c45ddc0d..afedea02188d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -236,7 +236,7 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
r = amdgpu_xcp_select_scheds(adev, hw_ip, hw_prio, fpriv,
&num_scheds, &scheds);
if (r)
- goto cleanup_entity;
+ goto error_free_entity;
}
/* disable load balance if the hw engine retains context among dependent jobs */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2819aceaab74..96b6738e6252 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2638,6 +2638,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
chip_name = "navi12";
break;
case CHIP_CYAN_SKILLFISH:
+ if (adev->mman.discovery_bin)
+ return 0;
chip_name = "cyan_skillfish";
break;
}
@@ -3414,10 +3416,11 @@ int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
continue;
- /* skip CG for VCE/UVD, it's handled specially */
+ /* skip CG for VCE/UVD/VPE, it's handled specially */
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
+ adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE &&
adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
adev->ip_blocks[i].version->funcs->set_powergating_state) {
/* enable powergating to save power */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 8561ad7f6180..ed3bef1edfe4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -82,6 +82,18 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+ /*
+ * Disable peer-to-peer access for DCC-enabled VRAM surfaces on GFX12+.
+ * Such buffers cannot be safely accessed over P2P due to device-local
+ * compression metadata. Fall back to the system-memory path instead.
+ * This applies when both of the following hold:
+ * - the device supports GFX12 (GC 12.x or newer);
+ * - the BO was created with the AMDGPU_GEM_CREATE_GFX12_DCC flag.
+ */
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0) &&
+ bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
+ attach->peer2peer = false;
+
if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) &&
pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
attach->peer2peer = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 9dcf51991b5b..869bceb0fe2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -597,6 +597,9 @@ int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev)
/* reserve engine 5 for firmware */
if (adev->enable_mes)
vm_inv_engs[i] &= ~(1 << 5);
+ /* reserve engine 6 for uni mes */
+ if (adev->enable_uni_mes)
+ vm_inv_engs[i] &= ~(1 << 6);
/* reserve mmhub engine 3 for firmware */
if (adev->enable_umsch_mm)
vm_inv_engs[i] &= ~(1 << 3);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
index 9cddbf50442a..37270c4dab8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
@@ -280,6 +280,8 @@ int isp_kernel_buffer_alloc(struct device *dev, u64 size,
if (ret)
return ret;
+ /* Ensure *bo is NULL so a new BO will be created */
+ *bo = NULL;
ret = amdgpu_bo_create_kernel(adev,
size,
ISP_MC_ADDR_ALIGN,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index aa9ee5dffa45..9d568c16beb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1372,7 +1372,7 @@ uint64_t amdgpu_ttm_tt_pde_flags(struct ttm_tt *ttm, struct ttm_resource *mem)
mem->mem_type == AMDGPU_PL_MMIO_REMAP)) {
flags |= AMDGPU_PTE_SYSTEM;
- if (ttm->caching == ttm_cached)
+ if (ttm && ttm->caching == ttm_cached)
flags |= AMDGPU_PTE_SNOOPED;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 761bad98da3e..4d0096d0baa9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -151,15 +151,16 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
{
struct amdgpu_userq_fence *userq_fence, *tmp;
struct dma_fence *fence;
+ unsigned long flags;
u64 rptr;
int i;
if (!fence_drv)
return;
+ spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
rptr = amdgpu_userq_fence_read(fence_drv);
- spin_lock(&fence_drv->fence_list_lock);
list_for_each_entry_safe(userq_fence, tmp, &fence_drv->fences, link) {
fence = &userq_fence->base;
@@ -174,7 +175,7 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
list_del(&userq_fence->link);
dma_fence_put(fence);
}
- spin_unlock(&fence_drv->fence_list_lock);
+ spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
}
void amdgpu_userq_fence_driver_destroy(struct kref *ref)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index c1a801203949..676e24fb8864 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1066,7 +1066,7 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
}
/* Prepare a TLB flush fence to be attached to PTs */
- if (!params->unlocked && vm->is_compute_context) {
+ if (!params->unlocked) {
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
/* Makes sure no PD/PT is freed before the flush */
@@ -2078,7 +2078,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
struct amdgpu_bo *bo = before->bo_va->base.bo;
amdgpu_vm_it_insert(before, &vm->va);
- if (before->flags & AMDGPU_PTE_PRT_FLAG(adev))
+ if (before->flags & AMDGPU_VM_PAGE_PRT)
amdgpu_vm_prt_get(adev);
if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
@@ -2093,7 +2093,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
struct amdgpu_bo *bo = after->bo_va->base.bo;
amdgpu_vm_it_insert(after, &vm->va);
- if (after->flags & AMDGPU_PTE_PRT_FLAG(adev))
+ if (after->flags & AMDGPU_VM_PAGE_PRT)
amdgpu_vm_prt_get(adev);
if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index d61eb9f187c6..f2be16e700c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -5872,9 +5872,9 @@ static void gfx_v11_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
if (flags & AMDGPU_IB_PREEMPTED)
control |= INDIRECT_BUFFER_PRE_RESUME(1);
- if (vmid)
+ if (vmid && !ring->adev->gfx.rs64_enable)
gfx_v11_0_ring_emit_de_meta(ring,
- (!amdgpu_sriov_vf(ring->adev) && flags & AMDGPU_IB_PREEMPTED) ? true : false);
+ !amdgpu_sriov_vf(ring->adev) && (flags & AMDGPU_IB_PREEMPTED));
}
amdgpu_ring_write(ring, header);
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
index baf097d2e1ac..ab0bf880d3d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
@@ -878,6 +878,7 @@ static const struct amdgpu_ring_funcs jpeg_v5_0_1_dec_ring_vm_funcs = {
.get_rptr = jpeg_v5_0_1_dec_ring_get_rptr,
.get_wptr = jpeg_v5_0_1_dec_ring_get_wptr,
.set_wptr = jpeg_v5_0_1_dec_ring_set_wptr,
+ .parse_cs = amdgpu_jpeg_dec_parse_cs,
.emit_frame_size =
SOC15_FLUSH_GPU_TLB_NUM_WREG * 6 +
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 8 +
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index eacf4e93ba2f..cb7123ec1a5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -141,7 +141,7 @@ static int vcn_v4_0_3_late_init(struct amdgpu_ip_block *ip_block)
adev->vcn.supported_reset =
amdgpu_get_soft_full_reset_mask(&adev->vcn.inst[0].ring_enc[0]);
- if (amdgpu_dpm_reset_vcn_is_supported(adev))
+ if (amdgpu_dpm_reset_vcn_is_supported(adev) && !amdgpu_sriov_vf(adev))
adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
index 714350cabf2f..8bd457dea4cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
@@ -122,7 +122,9 @@ static int vcn_v5_0_1_late_init(struct amdgpu_ip_block *ip_block)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 12):
- if ((adev->psp.sos.fw_version >= 0x00450025) && amdgpu_dpm_reset_vcn_is_supported(adev))
+ if ((adev->psp.sos.fw_version >= 0x00450025) &&
+ amdgpu_dpm_reset_vcn_is_supported(adev) &&
+ !amdgpu_sriov_vf(adev))
adev->vcn.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
break;
default:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index a65c67cf56ff..f1e7583650c4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -297,16 +297,16 @@ int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_prope
goto out_err_unreserve;
}
- if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
- pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
+ if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) {
+ pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n",
properties->ctx_save_restore_area_size,
topo_dev->node_props.cwsr_size);
err = -EINVAL;
goto out_err_unreserve;
}
- total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
- * NUM_XCC(pdd->dev->xcc_mask);
+ total_cwsr_size = (properties->ctx_save_restore_area_size +
+ topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
@@ -352,8 +352,8 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope
topo_dev = kfd_topology_device_by_id(pdd->dev->id);
if (!topo_dev)
return -EINVAL;
- total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
- * NUM_XCC(pdd->dev->xcc_mask);
+ total_cwsr_size = (properties->ctx_save_restore_area_size +
+ topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 9d72411c3379..74a1d3e1d52b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3687,6 +3687,8 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
/* TODO: unmap ranges from GPU that lost access */
}
+ update_mapping |= !p->xnack_enabled && !list_empty(&remap_list);
+
list_for_each_entry_safe(prange, next, &remove_list, update_list) {
pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
prange->svms, prange, prange->start,
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 91c0188a29b2..7fe40bbba265 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -3859,6 +3859,97 @@ void amdgpu_dm_update_connector_after_detect(
update_subconnector_property(aconnector);
}
+static bool are_sinks_equal(const struct dc_sink *sink1, const struct dc_sink *sink2)
+{
+ if (!sink1 || !sink2)
+ return false;
+ if (sink1->sink_signal != sink2->sink_signal)
+ return false;
+ /* Differing EDID lengths can never match; this also bounds the memcmp() below. */
+ if (sink1->dc_edid.length != sink2->dc_edid.length)
+ return false;
+ /* Same length on both sides: compare the raw EDID bytes. */
+ if (memcmp(sink1->dc_edid.raw_edid, sink2->dc_edid.raw_edid,
+ sink1->dc_edid.length) != 0)
+ return false;
+ return true;
+}
+
+
+/**
+ * hdmi_hpd_debounce_work() - delayed re-detection after an HDMI HPD disconnect
+ * @work: the delayed work embedded in the connector (hdmi_hpd_debounce_work)
+ *
+ * Runs hdmi_hpd_debounce_delay_ms after handle_hpd_irq_helper() sees an HDMI
+ * disconnect. The delay allows distinguishing a physical unplug (HPD low for
+ * longer than the delay) from a spontaneous RX HPD toggle (shorter than the
+ * delay), such as during power save transitions.
+ *
+ * The link is re-detected and the new sink is compared with the sink cached
+ * before the toggle; a hotplug event is only sent if the sinks differ.
+ *
+ * The default delay of 1500ms was chosen based on experimental testing with
+ * various monitors that exhibit spontaneous HPD toggling behavior.
+ */
+static void hdmi_hpd_debounce_work(struct work_struct *work)
+{
+ struct amdgpu_dm_connector *aconnector =
+ container_of(to_delayed_work(work), struct amdgpu_dm_connector,
+ hdmi_hpd_debounce_work);
+ struct drm_connector *connector = &aconnector->base;
+ struct drm_device *dev = connector->dev;
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ struct dc *dc = aconnector->dc_link->ctx->dc;
+ bool fake_reconnect = false;
+ bool reallow_idle = false;
+ bool ret = false;
+ guard(mutex)(&aconnector->hpd_lock);
+
+ /* Re-detect the display */
+ scoped_guard(mutex, &adev->dm.dc_lock) {
+ if (dc->caps.ips_support && dc->ctx->dmub_srv->idle_allowed) {
+ dc_allow_idle_optimizations(dc, false);
+ reallow_idle = true;
+ }
+ ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD);
+ }
+
+ if (ret) {
+ /* Apply workaround delay for certain panels */
+ apply_delay_after_dpcd_poweroff(adev, aconnector->dc_sink);
+ /* Compare sinks to determine if this was a spontaneous HPD toggle */
+ if (are_sinks_equal(aconnector->dc_link->local_sink, aconnector->hdmi_prev_sink)) {
+ /*
+ * Sinks match - this was a spontaneous HDMI HPD toggle.
+ */
+ drm_dbg_kms(dev, "HDMI HPD: Sink unchanged after debounce, internal re-enable\n");
+ fake_reconnect = true;
+ }
+
+ /* Update connector state */
+ amdgpu_dm_update_connector_after_detect(aconnector);
+
+ drm_modeset_lock_all(dev);
+ dm_restore_drm_connector_state(dev, connector);
+ drm_modeset_unlock_all(dev);
+
+ /* Only notify OS if sink actually changed */
+ if (!fake_reconnect && aconnector->base.force == DRM_FORCE_UNSPECIFIED)
+ drm_kms_helper_hotplug_event(dev);
+ }
+
+ /* Release the cached sink reference */
+ if (aconnector->hdmi_prev_sink) {
+ dc_sink_release(aconnector->hdmi_prev_sink);
+ aconnector->hdmi_prev_sink = NULL;
+ }
+
+ scoped_guard(mutex, &adev->dm.dc_lock) {
+ if (reallow_idle && dc->caps.ips_support)
+ dc_allow_idle_optimizations(dc, true);
+ }
+}
+
static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
{
struct drm_connector *connector = &aconnector->base;
@@ -3868,6 +3959,7 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
struct dm_connector_state *dm_con_state = to_dm_connector_state(connector->state);
struct dc *dc = aconnector->dc_link->ctx->dc;
bool ret = false;
+ bool debounce_required = false;
if (adev->dm.disable_hpd_irq)
return;
@@ -3890,6 +3982,14 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
if (!dc_link_detect_connection_type(aconnector->dc_link, &new_connection_type))
drm_err(adev_to_drm(adev), "KMS: Failed to detect connector\n");
+ /*
+ * Check for HDMI disconnect with debounce enabled.
+ */
+ debounce_required = (aconnector->hdmi_hpd_debounce_delay_ms > 0 &&
+ dc_is_hdmi_signal(aconnector->dc_link->connector_signal) &&
+ new_connection_type == dc_connection_none &&
+ aconnector->dc_link->local_sink != NULL);
+
if (aconnector->base.force && new_connection_type == dc_connection_none) {
emulated_link_detect(aconnector->dc_link);
@@ -3899,7 +3999,34 @@ static void handle_hpd_irq_helper(struct amdgpu_dm_connector *aconnector)
if (aconnector->base.force == DRM_FORCE_UNSPECIFIED)
drm_kms_helper_connector_hotplug_event(connector);
+ } else if (debounce_required) {
+ /*
+ * HDMI disconnect detected - schedule delayed work instead of
+ * processing immediately. This allows us to tell spurious HPD
+ * toggles apart from physical unplugs.
+ */
+ drm_dbg_kms(dev, "HDMI HPD: Disconnect detected, scheduling debounce work (%u ms)\n",
+ aconnector->hdmi_hpd_debounce_delay_ms);
+
+ /* Cache the current sink for later comparison */
+ if (aconnector->hdmi_prev_sink)
+ dc_sink_release(aconnector->hdmi_prev_sink);
+ aconnector->hdmi_prev_sink = aconnector->dc_link->local_sink;
+ if (aconnector->hdmi_prev_sink)
+ dc_sink_retain(aconnector->hdmi_prev_sink);
+
+ /* Schedule delayed detection. */
+ if (mod_delayed_work(system_wq,
+ &aconnector->hdmi_hpd_debounce_work,
+ msecs_to_jiffies(aconnector->hdmi_hpd_debounce_delay_ms)))
+ drm_dbg_kms(dev, "HDMI HPD: Re-scheduled debounce work\n");
+
} else {
+
+ /* If the aconnector->hdmi_hpd_debounce_work is scheduled, exit early */
+ if (delayed_work_pending(&aconnector->hdmi_hpd_debounce_work))
+ return;
+
scoped_guard(mutex, &adev->dm.dc_lock) {
dc_exit_ips_for_hw_access(dc);
ret = dc_link_detect(aconnector->dc_link, DETECT_REASON_HPD);
@@ -4925,6 +5052,21 @@ static void amdgpu_dm_backlight_set_level(struct amdgpu_display_manager *dm,
struct dc_link *link;
u32 brightness;
bool rc, reallow_idle = false;
+ struct drm_connector *connector;
+
+ list_for_each_entry(connector, &dm->ddev->mode_config.connector_list, head) {
+ struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
+
+ if (aconnector->bl_idx != bl_idx)
+ continue;
+
+ /* if connector is off, save the brightness for next time it's on */
+ if (!aconnector->base.encoder) {
+ dm->brightness[bl_idx] = user_brightness;
+ dm->actual_brightness[bl_idx] = 0;
+ return;
+ }
+ }
amdgpu_dm_update_backlight_caps(dm, bl_idx);
caps = &dm->backlight_caps[bl_idx];
@@ -7388,6 +7530,13 @@ static void amdgpu_dm_connector_destroy(struct drm_connector *connector)
if (aconnector->mst_mgr.dev)
drm_dp_mst_topology_mgr_destroy(&aconnector->mst_mgr);
+ /* Cancel and flush any pending HDMI HPD debounce work */
+ cancel_delayed_work_sync(&aconnector->hdmi_hpd_debounce_work);
+ if (aconnector->hdmi_prev_sink) {
+ dc_sink_release(aconnector->hdmi_prev_sink);
+ aconnector->hdmi_prev_sink = NULL;
+ }
+
if (aconnector->bl_idx != -1) {
backlight_device_unregister(dm->backlight_dev[aconnector->bl_idx]);
dm->backlight_dev[aconnector->bl_idx] = NULL;
@@ -8549,6 +8698,10 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm,
mutex_init(&aconnector->hpd_lock);
mutex_init(&aconnector->handle_mst_msg_ready);
+ aconnector->hdmi_hpd_debounce_delay_ms = AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS;
+ INIT_DELAYED_WORK(&aconnector->hdmi_hpd_debounce_work, hdmi_hpd_debounce_work);
+ aconnector->hdmi_prev_sink = NULL;
+
/*
* configure support HPD hot plug connector_>polled default value is 0
* which means HPD hot plug not supported
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index db75e991ac7b..8ca738957598 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -59,6 +59,7 @@
#define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL)
+#define AMDGPU_DM_HDMI_HPD_DEBOUNCE_MS 1500
/*
#include "include/amdgpu_dal_power_if.h"
#include "amdgpu_dm_irq.h"
@@ -819,6 +820,11 @@ struct amdgpu_dm_connector {
bool pack_sdp_v1_3;
enum adaptive_sync_type as_type;
struct amdgpu_hdmi_vsdb_info vsdb_info;
+
+ /* HDMI HPD debounce support */
+ unsigned int hdmi_hpd_debounce_delay_ms;
+ struct delayed_work hdmi_hpd_debounce_work;
+ struct dc_sink *hdmi_prev_sink;
};
static inline void amdgpu_dm_set_mst_status(uint8_t *status,
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index cc21337a182f..d0f770dd0a95 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -997,8 +997,8 @@ enum dc_edid_status dm_helpers_read_local_edid(
struct amdgpu_dm_connector *aconnector = link->priv;
struct drm_connector *connector = &aconnector->base;
struct i2c_adapter *ddc;
- int retry = 3;
- enum dc_edid_status edid_status;
+ int retry = 25;
+ enum dc_edid_status edid_status = EDID_NO_RESPONSE;
const struct drm_edid *drm_edid;
const struct edid *edid;
@@ -1028,7 +1028,7 @@ enum dc_edid_status dm_helpers_read_local_edid(
}
if (!drm_edid)
- return EDID_NO_RESPONSE;
+ continue;
edid = drm_edid_raw(drm_edid); // FIXME: Get rid of drm_edid_raw()
if (!edid ||
@@ -1046,7 +1046,7 @@ enum dc_edid_status dm_helpers_read_local_edid(
&sink->dc_edid,
&sink->edid_caps);
- } while (edid_status == EDID_BAD_CHECKSUM && --retry > 0);
+ } while ((edid_status == EDID_BAD_CHECKSUM || edid_status == EDID_NO_RESPONSE) && --retry > 0);
if (edid_status != EDID_OK)
DRM_ERROR("EDID err: %d, on connector: %s",
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 5e92eaa67aa3..dbd1da4d85d3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -884,26 +884,28 @@ struct dsc_mst_fairness_params {
};
#if defined(CONFIG_DRM_AMD_DC_FP)
-static uint16_t get_fec_overhead_multiplier(struct dc_link *dc_link)
+static uint64_t kbps_to_pbn(int kbps, bool is_peak_pbn)
{
- u8 link_coding_cap;
- uint16_t fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_8B_10B;
+ uint64_t effective_kbps = (uint64_t)kbps;
- link_coding_cap = dc_link_dp_mst_decide_link_encoding_format(dc_link);
- if (link_coding_cap == DP_128b_132b_ENCODING)
- fec_overhead_multiplier_x1000 = PBN_FEC_OVERHEAD_MULTIPLIER_128B_132B;
+ if (is_peak_pbn) { // add 0.6% (1006/1000) overhead into effective kbps
+ effective_kbps *= 1006;
+ effective_kbps = div_u64(effective_kbps, 1000);
+ }
- return fec_overhead_multiplier_x1000;
+ return (uint64_t) DIV64_U64_ROUND_UP(effective_kbps * 64, (54 * 8 * 1000));
}
-static int kbps_to_peak_pbn(int kbps, uint16_t fec_overhead_multiplier_x1000)
+static uint32_t pbn_to_kbps(unsigned int pbn, bool with_margin)
{
- u64 peak_kbps = kbps;
+ uint64_t pbn_effective = (uint64_t)pbn;
+
+ if (with_margin) // deduct 0.6% (994/1000) overhead from effective pbn
+ pbn_effective *= (1000000 / PEAK_FACTOR_X1000);
+ else
+ pbn_effective *= 1000;
- peak_kbps *= 1006;
- peak_kbps *= fec_overhead_multiplier_x1000;
- peak_kbps = div_u64(peak_kbps, 1000 * 1000);
- return (int) DIV64_U64_ROUND_UP(peak_kbps * 64, (54 * 8 * 1000));
+ return DIV_U64_ROUND_UP(pbn_effective * 8 * 54, 64);
}
static void set_dsc_configs_from_fairness_vars(struct dsc_mst_fairness_params *params,
@@ -974,7 +976,7 @@ static int bpp_x16_from_pbn(struct dsc_mst_fairness_params param, int pbn)
dc_dsc_get_default_config_option(param.sink->ctx->dc, &dsc_options);
dsc_options.max_target_bpp_limit_override_x16 = drm_connector->display_info.max_dsc_bpp * 16;
- kbps = div_u64((u64)pbn * 994 * 8 * 54, 64);
+ kbps = pbn_to_kbps(pbn, false);
dc_dsc_compute_config(
param.sink->ctx->dc->res_pool->dscs[0],
&param.sink->dsc_caps.dsc_dec_caps,
@@ -1003,12 +1005,11 @@ static int increase_dsc_bpp(struct drm_atomic_state *state,
int link_timeslots_used;
int fair_pbn_alloc;
int ret = 0;
- uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
for (i = 0; i < count; i++) {
if (vars[i + k].dsc_enabled) {
initial_slack[i] =
- kbps_to_peak_pbn(params[i].bw_range.max_kbps, fec_overhead_multiplier_x1000) - vars[i + k].pbn;
+ kbps_to_pbn(params[i].bw_range.max_kbps, false) - vars[i + k].pbn;
bpp_increased[i] = false;
remaining_to_increase += 1;
} else {
@@ -1104,7 +1105,6 @@ static int try_disable_dsc(struct drm_atomic_state *state,
int next_index;
int remaining_to_try = 0;
int ret;
- uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
int var_pbn;
for (i = 0; i < count; i++) {
@@ -1137,7 +1137,7 @@ static int try_disable_dsc(struct drm_atomic_state *state,
DRM_DEBUG_DRIVER("MST_DSC index #%d, try no compression\n", next_index);
var_pbn = vars[next_index].pbn;
- vars[next_index].pbn = kbps_to_peak_pbn(params[next_index].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
+ vars[next_index].pbn = kbps_to_pbn(params[next_index].bw_range.stream_kbps, true);
ret = drm_dp_atomic_find_time_slots(state,
params[next_index].port->mgr,
params[next_index].port,
@@ -1197,7 +1197,6 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
int count = 0;
int i, k, ret;
bool debugfs_overwrite = false;
- uint16_t fec_overhead_multiplier_x1000 = get_fec_overhead_multiplier(dc_link);
struct drm_connector_state *new_conn_state;
memset(params, 0, sizeof(params));
@@ -1278,7 +1277,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
DRM_DEBUG_DRIVER("MST_DSC Try no compression\n");
for (i = 0; i < count; i++) {
vars[i + k].aconnector = params[i].aconnector;
- vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
+ vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false);
vars[i + k].dsc_enabled = false;
vars[i + k].bpp_x16 = 0;
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr, params[i].port,
@@ -1300,7 +1299,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
DRM_DEBUG_DRIVER("MST_DSC Try max compression\n");
for (i = 0; i < count; i++) {
if (params[i].compression_possible && params[i].clock_force_enable != DSC_CLK_FORCE_DISABLE) {
- vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.min_kbps, fec_overhead_multiplier_x1000);
+ vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.min_kbps, false);
vars[i + k].dsc_enabled = true;
vars[i + k].bpp_x16 = params[i].bw_range.min_target_bpp_x16;
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr,
@@ -1308,7 +1307,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
if (ret < 0)
return ret;
} else {
- vars[i + k].pbn = kbps_to_peak_pbn(params[i].bw_range.stream_kbps, fec_overhead_multiplier_x1000);
+ vars[i + k].pbn = kbps_to_pbn(params[i].bw_range.stream_kbps, false);
vars[i + k].dsc_enabled = false;
vars[i + k].bpp_x16 = 0;
ret = drm_dp_atomic_find_time_slots(state, params[i].port->mgr,
@@ -1763,18 +1762,6 @@ clean_exit:
return ret;
}
-static uint32_t kbps_from_pbn(unsigned int pbn)
-{
- uint64_t kbps = (uint64_t)pbn;
-
- kbps *= (1000000 / PEAK_FACTOR_X1000);
- kbps *= 8;
- kbps *= 54;
- kbps /= 64;
-
- return (uint32_t)kbps;
-}
-
static bool is_dsc_common_config_possible(struct dc_stream_state *stream,
struct dc_dsc_bw_range *bw_range)
{
@@ -1873,7 +1860,7 @@ enum dc_status dm_dp_mst_is_port_support_mode(
dc_link_get_highest_encoding_format(stream->link));
cur_link_settings = stream->link->verified_link_cap;
root_link_bw_in_kbps = dc_link_bandwidth_kbps(aconnector->dc_link, &cur_link_settings);
- virtual_channel_bw_in_kbps = kbps_from_pbn(aconnector->mst_output_port->full_pbn);
+ virtual_channel_bw_in_kbps = pbn_to_kbps(aconnector->mst_output_port->full_pbn, true);
/* pick the end to end bw bottleneck */
end_to_end_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps);
@@ -1926,7 +1913,7 @@ enum dc_status dm_dp_mst_is_port_support_mode(
immediate_upstream_port = aconnector->mst_output_port->parent->port_parent;
if (immediate_upstream_port) {
- virtual_channel_bw_in_kbps = kbps_from_pbn(immediate_upstream_port->full_pbn);
+ virtual_channel_bw_in_kbps = pbn_to_kbps(immediate_upstream_port->full_pbn, true);
virtual_channel_bw_in_kbps = min(root_link_bw_in_kbps, virtual_channel_bw_in_kbps);
} else {
/* For topology LCT 1 case - only one mstb*/
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
index b11383fba35f..1eb04772f5da 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
@@ -394,6 +394,8 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base,
display_count = dcn35_get_active_display_cnt_wa(dc, context, &all_active_disps);
if (new_clocks->dtbclk_en && !new_clocks->ref_dtbclk_khz)
new_clocks->ref_dtbclk_khz = 600000;
+ else if (!new_clocks->dtbclk_en && new_clocks->ref_dtbclk_khz > 590000)
+ new_clocks->ref_dtbclk_khz = 0;
/*
* if it is safe to lower, but we are already in the lower state, we don't have to do anything
@@ -435,7 +437,7 @@ void dcn35_update_clocks(struct clk_mgr *clk_mgr_base,
actual_dtbclk = REG_READ(CLK1_CLK4_CURRENT_CNT);
- if (actual_dtbclk) {
+ if (actual_dtbclk > 590000) {
clk_mgr_base->clks.ref_dtbclk_khz = new_clocks->ref_dtbclk_khz;
clk_mgr_base->clks.dtbclk_en = new_clocks->dtbclk_en;
}
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
index 9ac2d41f8fca..0a46e834357a 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
@@ -705,9 +705,14 @@ bool dc_stream_get_scanoutpos(const struct dc_stream_state *stream,
{
uint8_t i;
bool ret = false;
- struct dc *dc = stream->ctx->dc;
- struct resource_context *res_ctx =
- &dc->current_state->res_ctx;
+ struct dc *dc;
+ struct resource_context *res_ctx;
+
+ if (!stream->ctx)
+ return false;
+
+ dc = stream->ctx->dc;
+ res_ctx = &dc->current_state->res_ctx;
dc_exit_ips_for_hw_access(dc);
diff --git a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
index de6d62401362..c899c09ea31b 100644
--- a/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
+++ b/drivers/gpu/drm/amd/display/dc/dccg/dcn35/dcn35_dccg.c
@@ -1411,7 +1411,7 @@ static void dccg35_set_dtbclk_dto(
__func__, params->otg_inst, params->pixclk_khz,
params->ref_dtbclk_khz, req_dtbclk_khz, phase, modulo);
- } else {
+ } else if (!params->ref_dtbclk_khz && !req_dtbclk_khz) {
switch (params->otg_inst) {
case 0:
REG_UPDATE(DCCG_GATE_DISABLE_CNTL5, DTBCLK_P0_GATE_DISABLE, 0);
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
index 24184b4eb352..ebc220b29d14 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
@@ -671,7 +671,6 @@ void dce110_enable_stream(struct pipe_ctx *pipe_ctx)
uint32_t early_control = 0;
struct timing_generator *tg = pipe_ctx->stream_res.tg;
- link_hwss->setup_stream_attribute(pipe_ctx);
link_hwss->setup_stream_encoder(pipe_ctx);
dc->hwss.update_info_frame(pipe_ctx);
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
index 9477c9f9e196..56c1ab6c7330 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c
@@ -614,6 +614,14 @@ void dcn20_dpp_pg_control(
* DOMAIN11_PGFSM_PWR_STATUS, pwr_status,
* 1, 1000);
*/
+
+ /* Force disable cursor on plane powerdown on DPP 5 using dpp_force_disable_cursor */
+ if (!power_on) {
+ struct dpp *dpp5 = hws->ctx->dc->res_pool->dpps[dpp_inst];
+ if (dpp5 && dpp5->funcs->dpp_force_disable_cursor)
+ dpp5->funcs->dpp_force_disable_cursor(dpp5);
+ }
+
break;
default:
BREAK_TO_DEBUGGER();
@@ -3052,8 +3060,6 @@ void dcn20_enable_stream(struct pipe_ctx *pipe_ctx)
link_enc->transmitter - TRANSMITTER_UNIPHY_A);
}
- link_hwss->setup_stream_attribute(pipe_ctx);
-
if (dc->res_pool->dccg->funcs->set_pixel_rate_div)
dc->res_pool->dccg->funcs->set_pixel_rate_div(
dc->res_pool->dccg,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
index ce3d0b45fb4c..68e48a2492c9 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn401/dcn401_hwseq.c
@@ -971,8 +971,6 @@ void dcn401_enable_stream(struct pipe_ctx *pipe_ctx)
}
}
- link_hwss->setup_stream_attribute(pipe_ctx);
-
if (dc->res_pool->dccg->funcs->set_pixel_rate_div) {
dc->res_pool->dccg->funcs->set_pixel_rate_div(
dc->res_pool->dccg,
diff --git a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
index 83419e1a9036..b66fbcb0040d 100644
--- a/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
+++ b/drivers/gpu/drm/amd/display/dc/link/link_dpms.c
@@ -2458,6 +2458,7 @@ void link_set_dpms_on(
struct link_encoder *link_enc = pipe_ctx->link_res.dio_link_enc;
enum otg_out_mux_dest otg_out_dest = OUT_MUX_DIO;
struct vpg *vpg = pipe_ctx->stream_res.stream_enc->vpg;
+ const struct link_hwss *link_hwss = get_link_hwss(link, &pipe_ctx->link_res);
bool apply_edp_fast_boot_optimization =
pipe_ctx->stream->apply_edp_fast_boot_optimization;
@@ -2502,6 +2503,8 @@ void link_set_dpms_on(
pipe_ctx->stream_res.tg->funcs->set_out_mux(pipe_ctx->stream_res.tg, otg_out_dest);
}
+ link_hwss->setup_stream_attribute(pipe_ctx);
+
pipe_ctx->stream->apply_edp_fast_boot_optimization = false;
// Enable VPG before building infoframe
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
index b12c11bd6a14..eb262ce42e2d 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
@@ -1691,7 +1691,7 @@ static bool retrieve_link_cap(struct dc_link *link)
union edp_configuration_cap edp_config_cap;
union dp_downstream_port_present ds_port = { 0 };
enum dc_status status = DC_ERROR_UNEXPECTED;
- uint32_t read_dpcd_retry_cnt = 3;
+ uint32_t read_dpcd_retry_cnt = 20;
int i;
struct dp_sink_hw_fw_revision dp_hw_fw_revision;
const uint32_t post_oui_delay = 30; // 30ms
@@ -1734,12 +1734,13 @@ static bool retrieve_link_cap(struct dc_link *link)
}
dpcd_set_source_specific_data(link);
- /* Sink may need to configure internals based on vendor, so allow some
- * time before proceeding with possibly vendor specific transactions
- */
- msleep(post_oui_delay);
for (i = 0; i < read_dpcd_retry_cnt; i++) {
+ /*
+ * Sink may need to configure internals based on vendor, so allow some
+ * time before proceeding with possibly vendor specific transactions
+ */
+ msleep(post_oui_delay);
status = core_link_read_dpcd(
link,
DP_DPCD_REV,
diff --git a/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c
index 6ffc74fc9dcd..ad088d70e189 100644
--- a/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c
+++ b/drivers/gpu/drm/amd/display/dc/virtual/virtual_stream_encoder.c
@@ -44,11 +44,6 @@ static void virtual_stream_encoder_dvi_set_stream_attribute(
struct dc_crtc_timing *crtc_timing,
bool is_dual_link) {}
-static void virtual_stream_encoder_lvds_set_stream_attribute(
- struct stream_encoder *enc,
- struct dc_crtc_timing *crtc_timing)
-{}
-
static void virtual_stream_encoder_set_throttled_vcp_size(
struct stream_encoder *enc,
struct fixed31_32 avg_time_slots_per_mtp)
@@ -120,8 +115,6 @@ static const struct stream_encoder_funcs virtual_str_enc_funcs = {
virtual_stream_encoder_hdmi_set_stream_attribute,
.dvi_set_stream_attribute =
virtual_stream_encoder_dvi_set_stream_attribute,
- .lvds_set_stream_attribute =
- virtual_stream_encoder_lvds_set_stream_attribute,
.set_throttled_vcp_size =
virtual_stream_encoder_set_throttled_vcp_size,
.update_hdmi_info_packets =
diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index ce421bcddcb0..1aae46d703ba 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -1260,6 +1260,17 @@ void mod_freesync_handle_v_update(struct mod_freesync *mod_freesync,
update_v_total_for_static_ramp(
core_freesync, stream, in_out_vrr);
}
+
+ /*
+ * If VRR is inactive, set vtotal min and max to nominal vtotal
+ */
+ if (in_out_vrr->state == VRR_STATE_INACTIVE) {
+ in_out_vrr->adjust.v_total_min =
+ mod_freesync_calc_v_total_from_refresh(stream,
+ in_out_vrr->max_refresh_in_uhz);
+ in_out_vrr->adjust.v_total_max = in_out_vrr->adjust.v_total_min;
+ return;
+ }
}
unsigned long long mod_freesync_calc_nominal_field_rate(
diff --git a/drivers/gpu/drm/bridge/sii902x.c b/drivers/gpu/drm/bridge/sii902x.c
index d537b1d036fb..1f0aba28ad1e 100644
--- a/drivers/gpu/drm/bridge/sii902x.c
+++ b/drivers/gpu/drm/bridge/sii902x.c
@@ -179,7 +179,6 @@ struct sii902x {
struct drm_connector connector;
struct gpio_desc *reset_gpio;
struct i2c_mux_core *i2cmux;
- bool sink_is_hdmi;
u32 bus_width;
/*
@@ -315,8 +314,6 @@ static int sii902x_get_modes(struct drm_connector *connector)
drm_edid_free(drm_edid);
}
- sii902x->sink_is_hdmi = connector->display_info.is_hdmi;
-
return num;
}
@@ -342,9 +339,17 @@ static void sii902x_bridge_atomic_enable(struct drm_bridge *bridge,
struct drm_atomic_state *state)
{
struct sii902x *sii902x = bridge_to_sii902x(bridge);
+ struct drm_connector *connector;
+ u8 output_mode = SII902X_SYS_CTRL_OUTPUT_DVI;
+
+ connector = drm_atomic_get_new_connector_for_encoder(state, bridge->encoder);
+ if (connector && connector->display_info.is_hdmi)
+ output_mode = SII902X_SYS_CTRL_OUTPUT_HDMI;
mutex_lock(&sii902x->mutex);
+ regmap_update_bits(sii902x->regmap, SII902X_SYS_CTRL_DATA,
+ SII902X_SYS_CTRL_OUTPUT_MODE, output_mode);
regmap_update_bits(sii902x->regmap, SII902X_PWR_STATE_CTRL,
SII902X_AVI_POWER_STATE_MSK,
SII902X_AVI_POWER_STATE_D(0));
@@ -359,16 +364,12 @@ static void sii902x_bridge_mode_set(struct drm_bridge *bridge,
const struct drm_display_mode *adj)
{
struct sii902x *sii902x = bridge_to_sii902x(bridge);
- u8 output_mode = SII902X_SYS_CTRL_OUTPUT_DVI;
struct regmap *regmap = sii902x->regmap;
u8 buf[HDMI_INFOFRAME_SIZE(AVI)];
struct hdmi_avi_infoframe frame;
u16 pixel_clock_10kHz = adj->clock / 10;
int ret;
- if (sii902x->sink_is_hdmi)
- output_mode = SII902X_SYS_CTRL_OUTPUT_HDMI;
-
buf[0] = pixel_clock_10kHz & 0xff;
buf[1] = pixel_clock_10kHz >> 8;
buf[2] = drm_mode_vrefresh(adj);
@@ -384,11 +385,6 @@ static void sii902x_bridge_mode_set(struct drm_bridge *bridge,
mutex_lock(&sii902x->mutex);
- ret = regmap_update_bits(sii902x->regmap, SII902X_SYS_CTRL_DATA,
- SII902X_SYS_CTRL_OUTPUT_MODE, output_mode);
- if (ret)
- goto out;
-
ret = regmap_bulk_write(regmap, SII902X_TPI_VIDEO_DATA, buf, 10);
if (ret)
goto out;
diff --git a/drivers/gpu/drm/clients/drm_client_setup.c b/drivers/gpu/drm/clients/drm_client_setup.c
index 72480db1f00d..515aceac22b1 100644
--- a/drivers/gpu/drm/clients/drm_client_setup.c
+++ b/drivers/gpu/drm/clients/drm_client_setup.c
@@ -13,8 +13,8 @@
static char drm_client_default[16] = CONFIG_DRM_CLIENT_DEFAULT;
module_param_string(active, drm_client_default, sizeof(drm_client_default), 0444);
MODULE_PARM_DESC(active,
- "Choose which drm client to start, default is"
- CONFIG_DRM_CLIENT_DEFAULT "]");
+ "Choose which drm client to start, default is "
+ CONFIG_DRM_CLIENT_DEFAULT);
/**
* drm_client_setup() - Setup in-kernel DRM clients
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 11a5b60cb9ce..0b3ee008523d 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -31,9 +31,7 @@
#include <linux/console.h>
#include <linux/export.h>
-#include <linux/pci.h>
#include <linux/sysrq.h>
-#include <linux/vga_switcheroo.h>
#include <drm/drm_atomic.h>
#include <drm/drm_drv.h>
@@ -566,11 +564,6 @@ EXPORT_SYMBOL(drm_fb_helper_release_info);
*/
void drm_fb_helper_unregister_info(struct drm_fb_helper *fb_helper)
{
- struct fb_info *info = fb_helper->info;
- struct device *dev = info->device;
-
- if (dev_is_pci(dev))
- vga_switcheroo_client_fb_set(to_pci_dev(dev), NULL);
unregister_framebuffer(fb_helper->info);
}
EXPORT_SYMBOL(drm_fb_helper_unregister_info);
@@ -1632,7 +1625,6 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper)
struct drm_client_dev *client = &fb_helper->client;
struct drm_device *dev = fb_helper->dev;
struct drm_fb_helper_surface_size sizes;
- struct fb_info *info;
int ret;
if (drm_WARN_ON(dev, !dev->driver->fbdev_probe))
@@ -1653,12 +1645,6 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper)
strcpy(fb_helper->fb->comm, "[fbcon]");
- info = fb_helper->info;
-
- /* Set the fb info for vgaswitcheroo clients. Does nothing otherwise. */
- if (dev_is_pci(info->device))
- vga_switcheroo_client_fb_set(to_pci_dev(info->device), info);
-
return 0;
}
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 38f82391bfda..a30493ed9715 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -210,7 +210,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev,
formats_size = sizeof(__u32) * plane->format_count;
if (WARN_ON(!formats_size)) {
/* 0 formats are never expected */
- return 0;
+ return ERR_PTR(-EINVAL);
}
modifiers_size =
@@ -226,7 +226,7 @@ static struct drm_property_blob *create_in_format_blob(struct drm_device *dev,
blob = drm_property_create_blob(dev, blob_size, NULL);
if (IS_ERR(blob))
- return NULL;
+ return blob;
blob_data = blob->data;
blob_data->version = FORMAT_BLOB_CURRENT;
diff --git a/drivers/gpu/drm/i915/display/intel_cx0_phy.c b/drivers/gpu/drm/i915/display/intel_cx0_phy.c
index 801235a5bc0a..a2d2cecf7121 100644
--- a/drivers/gpu/drm/i915/display/intel_cx0_phy.c
+++ b/drivers/gpu/drm/i915/display/intel_cx0_phy.c
@@ -39,14 +39,12 @@ bool intel_encoder_is_c10phy(struct intel_encoder *encoder)
struct intel_display *display = to_intel_display(encoder);
enum phy phy = intel_encoder_to_phy(encoder);
- /* PTL doesn't have a PHY connected to PORT B; as such,
- * there will never be a case where PTL uses PHY B.
- * WCL uses PORT A and B with the C10 PHY.
- * Reusing the condition for WCL and extending it for PORT B
- * should not cause any issues for PTL.
- */
- if (display->platform.pantherlake && phy < PHY_C)
- return true;
+ if (display->platform.pantherlake) {
+ if (display->platform.pantherlake_wildcatlake)
+ return phy <= PHY_B;
+ else
+ return phy == PHY_A;
+ }
if ((display->platform.lunarlake || display->platform.meteorlake) && phy < PHY_C)
return true;
diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c
index 5dca7f96b425..0d527cf22866 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -5964,6 +5964,14 @@ static int intel_async_flip_check_uapi(struct intel_atomic_state *state,
return -EINVAL;
}
+ /* FIXME: selective fetch should be disabled for async flips */
+ if (new_crtc_state->enable_psr2_sel_fetch) {
+ drm_dbg_kms(display->drm,
+ "[CRTC:%d:%s] async flip disallowed with PSR2 selective fetch\n",
+ crtc->base.base.id, crtc->base.name);
+ return -EINVAL;
+ }
+
for_each_oldnew_intel_plane_in_state(state, plane, old_plane_state,
new_plane_state, i) {
if (plane->pipe != crtc->pipe)
diff --git a/drivers/gpu/drm/i915/display/intel_display_device.c b/drivers/gpu/drm/i915/display/intel_display_device.c
index a002bc6ce7b0..f3f1f25b0f38 100644
--- a/drivers/gpu/drm/i915/display/intel_display_device.c
+++ b/drivers/gpu/drm/i915/display/intel_display_device.c
@@ -1404,8 +1404,20 @@ static const struct platform_desc bmg_desc = {
PLATFORM_GROUP(dgfx),
};
+static const u16 wcl_ids[] = {
+ INTEL_WCL_IDS(ID),
+ 0
+};
+
static const struct platform_desc ptl_desc = {
PLATFORM(pantherlake),
+ .subplatforms = (const struct subplatform_desc[]) {
+ {
+ SUBPLATFORM(pantherlake, wildcatlake),
+ .pciidlist = wcl_ids,
+ },
+ {},
+ }
};
__diag_pop();
@@ -1482,6 +1494,7 @@ static const struct {
INTEL_LNL_IDS(INTEL_DISPLAY_DEVICE, &lnl_desc),
INTEL_BMG_IDS(INTEL_DISPLAY_DEVICE, &bmg_desc),
INTEL_PTL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc),
+ INTEL_WCL_IDS(INTEL_DISPLAY_DEVICE, &ptl_desc),
};
static const struct {
diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h
index f329f1beafef..a910642d589c 100644
--- a/drivers/gpu/drm/i915/display/intel_display_device.h
+++ b/drivers/gpu/drm/i915/display/intel_display_device.h
@@ -101,7 +101,9 @@ struct pci_dev;
/* Display ver 14.1 (based on GMD ID) */ \
func(battlemage) \
/* Display ver 30 (based on GMD ID) */ \
- func(pantherlake)
+ func(pantherlake) \
+ func(pantherlake_wildcatlake)
+
#define __MEMBER(name) unsigned long name:1;
#define __COUNT(x) 1 +
diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c
index 4a4cace1f879..e1455fd7277f 100644
--- a/drivers/gpu/drm/i915/display/intel_dmc.c
+++ b/drivers/gpu/drm/i915/display/intel_dmc.c
@@ -127,6 +127,9 @@ static bool dmc_firmware_param_disabled(struct intel_display *display)
#define DISPLAY_VER13_DMC_MAX_FW_SIZE 0x20000
#define DISPLAY_VER12_DMC_MAX_FW_SIZE ICL_DMC_MAX_FW_SIZE
+#define XE3LPD_3002_DMC_PATH DMC_PATH(xe3lpd_3002)
+MODULE_FIRMWARE(XE3LPD_3002_DMC_PATH);
+
#define XE3LPD_DMC_PATH DMC_PATH(xe3lpd)
MODULE_FIRMWARE(XE3LPD_DMC_PATH);
@@ -183,9 +186,10 @@ static const char *dmc_firmware_default(struct intel_display *display, u32 *size
{
const char *fw_path = NULL;
u32 max_fw_size = 0;
-
- if (DISPLAY_VERx100(display) == 3002 ||
- DISPLAY_VERx100(display) == 3000) {
+ if (DISPLAY_VERx100(display) == 3002) {
+ fw_path = XE3LPD_3002_DMC_PATH;
+ max_fw_size = XE2LPD_DMC_MAX_FW_SIZE;
+ } else if (DISPLAY_VERx100(display) == 3000) {
fw_path = XE3LPD_DMC_PATH;
max_fw_size = XE2LPD_DMC_MAX_FW_SIZE;
} else if (DISPLAY_VERx100(display) == 2000) {
diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c
index 10eb93a34cf2..6d9c95e5c025 100644
--- a/drivers/gpu/drm/i915/display/intel_psr.c
+++ b/drivers/gpu/drm/i915/display/intel_psr.c
@@ -585,6 +585,10 @@ static void _panel_replay_init_dpcd(struct intel_dp *intel_dp)
struct intel_display *display = to_intel_display(intel_dp);
int ret;
+ /* TODO: Enable Panel Replay on MST once it's properly implemented. */
+ if (intel_dp->mst_detect == DRM_DP_MST)
+ return;
+
ret = drm_dp_dpcd_read_data(&intel_dp->aux, DP_PANEL_REPLAY_CAP_SUPPORT,
&intel_dp->pr_dpcd, sizeof(intel_dp->pr_dpcd));
if (ret < 0)
@@ -888,7 +892,8 @@ static bool is_dc5_dc6_blocked(struct intel_dp *intel_dp)
{
struct intel_display *display = to_intel_display(intel_dp);
u32 current_dc_state = intel_display_power_get_current_dc_state(display);
- struct drm_vblank_crtc *vblank = &display->drm->vblank[intel_dp->psr.pipe];
+ struct intel_crtc *crtc = intel_crtc_for_pipe(display, intel_dp->psr.pipe);
+ struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(&crtc->base);
return (current_dc_state != DC_STATE_EN_UPTO_DC5 &&
current_dc_state != DC_STATE_EN_UPTO_DC6) ||
@@ -1251,12 +1256,6 @@ static bool intel_psr2_sel_fetch_config_valid(struct intel_dp *intel_dp,
return false;
}
- if (crtc_state->uapi.async_flip) {
- drm_dbg_kms(display->drm,
- "PSR2 sel fetch not enabled, async flip enabled\n");
- return false;
- }
-
return crtc_state->enable_psr2_sel_fetch = true;
}
diff --git a/drivers/gpu/drm/imagination/pvr_device.h b/drivers/gpu/drm/imagination/pvr_device.h
index ab8f56ae15df..ec53ff275541 100644
--- a/drivers/gpu/drm/imagination/pvr_device.h
+++ b/drivers/gpu/drm/imagination/pvr_device.h
@@ -146,6 +146,14 @@ struct pvr_device {
*/
struct clk *mem_clk;
+ /**
+ * @power: Optional power domain devices.
+ *
+ * On platforms with more than one power domain for the GPU, they are
+ * stored here in @domain_devs, along with links between them in
+ * @domain_links. The size of @domain_devs is given by @domain_count,
+ * while the size of @domain_links is (2 * @domain_count) - 1.
+ */
struct pvr_device_power {
struct device **domain_devs;
struct device_link **domain_links;
diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
index cac6d64ab67d..4e8b3f1c7e25 100644
--- a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
+++ b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
@@ -159,6 +159,8 @@ nvkm_falcon_fw_dtor(struct nvkm_falcon_fw *fw)
nvkm_memory_unref(&fw->inst);
nvkm_falcon_fw_dtor_sigs(fw);
nvkm_firmware_dtor(&fw->fw);
+ kfree(fw->boot);
+ fw->boot = NULL;
}
static const struct nvkm_firmware_func
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 156c7a0b62a2..3f43686f0195 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -288,6 +288,23 @@ panthor_gem_create_with_handle(struct drm_file *file,
panthor_gem_debugfs_set_usage_flags(bo, 0);
+ /* If this is a write-combine mapping, we query the sgt to force a CPU
+ * cache flush (dma_map_sgtable() is called when the sgt is created).
+ * This ensures the zero-ing is visible to any uncached mapping created
+ * by vmap/mmap.
+ * FIXME: Ideally this should be done when pages are allocated, not at
+ * BO creation time.
+ */
+ if (shmem->map_wc) {
+ struct sg_table *sgt;
+
+ sgt = drm_gem_shmem_get_pages_sgt(shmem);
+ if (IS_ERR(sgt)) {
+ ret = PTR_ERR(sgt);
+ goto out_put_gem;
+ }
+ }
+
/*
* Allocate an id of idr table where the obj is registered
* and handle has the id what user can see.
@@ -296,6 +313,7 @@ panthor_gem_create_with_handle(struct drm_file *file,
if (!ret)
*size = bo->base.base.size;
+out_put_gem:
/* drop reference from allocate - handle holds it now. */
drm_gem_object_put(&shmem->base);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 5b5b54e876d4..167d6f122b8e 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -360,13 +360,6 @@ static bool radeon_fence_is_signaled(struct dma_fence *f)
if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq)
return true;
- if (down_read_trylock(&rdev->exclusive_lock)) {
- radeon_fence_process(rdev, ring);
- up_read(&rdev->exclusive_lock);
-
- if (atomic64_read(&rdev->fence_drv[ring].last_seq) >= seq)
- return true;
- }
return false;
}
diff --git a/drivers/gpu/drm/sti/sti_vtg.c b/drivers/gpu/drm/sti/sti_vtg.c
index ee81691b3203..ce6bc7e7b135 100644
--- a/drivers/gpu/drm/sti/sti_vtg.c
+++ b/drivers/gpu/drm/sti/sti_vtg.c
@@ -143,12 +143,17 @@ struct sti_vtg {
struct sti_vtg *of_vtg_find(struct device_node *np)
{
struct platform_device *pdev;
+ struct sti_vtg *vtg;
pdev = of_find_device_by_node(np);
if (!pdev)
return NULL;
- return (struct sti_vtg *)platform_get_drvdata(pdev);
+ vtg = platform_get_drvdata(pdev);
+
+ put_device(&pdev->dev);
+
+ return vtg;
}
static void vtg_reset(struct sti_vtg *vtg)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c
index 59d5c1ba145a..6c84bd69b11f 100644
--- a/drivers/gpu/drm/tegra/dc.c
+++ b/drivers/gpu/drm/tegra/dc.c
@@ -3148,6 +3148,7 @@ static int tegra_dc_couple(struct tegra_dc *dc)
dc->client.parent = &parent->client;
dev_dbg(dc->dev, "coupled to %s\n", dev_name(companion));
+ put_device(companion);
}
return 0;
diff --git a/drivers/gpu/drm/tegra/dsi.c b/drivers/gpu/drm/tegra/dsi.c
index b5089b772267..ddfb2858acbf 100644
--- a/drivers/gpu/drm/tegra/dsi.c
+++ b/drivers/gpu/drm/tegra/dsi.c
@@ -913,15 +913,6 @@ static void tegra_dsi_encoder_enable(struct drm_encoder *encoder)
u32 value;
int err;
- /* If the bootloader enabled DSI it needs to be disabled
- * in order for the panel initialization commands to be
- * properly sent.
- */
- value = tegra_dsi_readl(dsi, DSI_POWER_CONTROL);
-
- if (value & DSI_POWER_CONTROL_ENABLE)
- tegra_dsi_disable(dsi);
-
err = tegra_dsi_prepare(dsi);
if (err < 0) {
dev_err(dsi->dev, "failed to prepare: %d\n", err);
diff --git a/drivers/gpu/drm/tegra/uapi.c b/drivers/gpu/drm/tegra/uapi.c
index 5adab6b22916..d0b6a1fa6efa 100644
--- a/drivers/gpu/drm/tegra/uapi.c
+++ b/drivers/gpu/drm/tegra/uapi.c
@@ -114,9 +114,12 @@ int tegra_drm_ioctl_channel_open(struct drm_device *drm, void *data, struct drm_
if (err)
goto put_channel;
- if (supported)
+ if (supported) {
+ struct pid *pid = get_task_pid(current, PIDTYPE_TGID);
context->memory_context = host1x_memory_context_alloc(
- host, client->base.dev, get_task_pid(current, PIDTYPE_TGID));
+ host, client->base.dev, pid);
+ put_pid(pid);
+ }
if (IS_ERR(context->memory_context)) {
if (PTR_ERR(context->memory_context) != -EOPNOTSUPP) {
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c
index 718832b08d96..c46f17ba7236 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.c
@@ -100,8 +100,10 @@ vmw_cursor_update_type(struct vmw_private *vmw, struct vmw_plane_state *vps)
if (vmw->has_mob) {
if ((vmw->capabilities2 & SVGA_CAP2_CURSOR_MOB) != 0)
return VMW_CURSOR_UPDATE_MOB;
+ else
+ return VMW_CURSOR_UPDATE_GB_ONLY;
}
-
+ drm_warn_once(&vmw->drm, "Unknown Cursor Type!\n");
return VMW_CURSOR_UPDATE_NONE;
}
@@ -139,6 +141,7 @@ static u32 vmw_cursor_mob_size(enum vmw_cursor_update_type update_type,
{
switch (update_type) {
case VMW_CURSOR_UPDATE_LEGACY:
+ case VMW_CURSOR_UPDATE_GB_ONLY:
case VMW_CURSOR_UPDATE_NONE:
return 0;
case VMW_CURSOR_UPDATE_MOB:
@@ -623,6 +626,7 @@ int vmw_cursor_plane_prepare_fb(struct drm_plane *plane,
if (!surface || vps->cursor.legacy.id == surface->snooper.id)
vps->cursor.update_type = VMW_CURSOR_UPDATE_NONE;
break;
+ case VMW_CURSOR_UPDATE_GB_ONLY:
case VMW_CURSOR_UPDATE_MOB: {
bo = vmw_user_object_buffer(&vps->uo);
if (bo) {
@@ -737,6 +741,7 @@ void
vmw_cursor_plane_atomic_update(struct drm_plane *plane,
struct drm_atomic_state *state)
{
+ struct vmw_bo *bo;
struct drm_plane_state *new_state =
drm_atomic_get_new_plane_state(state, plane);
struct drm_plane_state *old_state =
@@ -762,6 +767,15 @@ vmw_cursor_plane_atomic_update(struct drm_plane *plane,
case VMW_CURSOR_UPDATE_MOB:
vmw_cursor_update_mob(dev_priv, vps);
break;
+ case VMW_CURSOR_UPDATE_GB_ONLY:
+ bo = vmw_user_object_buffer(&vps->uo);
+ if (bo)
+ vmw_send_define_cursor_cmd(dev_priv, bo->map.virtual,
+ vps->base.crtc_w,
+ vps->base.crtc_h,
+ vps->base.hotspot_x,
+ vps->base.hotspot_y);
+ break;
case VMW_CURSOR_UPDATE_NONE:
/* do nothing */
break;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h
index 40694925a70e..0c2cc0699b0d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cursor_plane.h
@@ -33,6 +33,7 @@ static const u32 __maybe_unused vmw_cursor_plane_formats[] = {
enum vmw_cursor_update_type {
VMW_CURSOR_UPDATE_NONE = 0,
VMW_CURSOR_UPDATE_LEGACY,
+ VMW_CURSOR_UPDATE_GB_ONLY,
VMW_CURSOR_UPDATE_MOB,
};
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index d539f25b5fbe..3057f8baa7d2 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -3668,6 +3668,11 @@ static int vmw_cmd_check(struct vmw_private *dev_priv,
cmd_id = header->id;
+ if (header->size > SVGA_CMD_MAX_DATASIZE) {
+ VMW_DEBUG_USER("SVGA3D command: %d is too big.\n",
+ cmd_id + SVGA_3D_CMD_BASE);
+ return -E2BIG;
+ }
*size = header->size + sizeof(SVGA3dCmdHeader);
cmd_id -= SVGA_3D_CMD_BASE;
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
index 7de20e56082c..fd4e76486f2d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
@@ -32,22 +32,22 @@ enum vmw_bo_dirty_method {
/**
* struct vmw_bo_dirty - Dirty information for buffer objects
+ * @ref_count: Reference count for this structure. Must be first member!
* @start: First currently dirty bit
* @end: Last currently dirty bit + 1
* @method: The currently used dirty method
* @change_count: Number of consecutive method change triggers
- * @ref_count: Reference count for this structure
* @bitmap_size: The size of the bitmap in bits. Typically equal to the
* nuber of pages in the bo.
* @bitmap: A bitmap where each bit represents a page. A set bit means a
* dirty page.
*/
struct vmw_bo_dirty {
+ struct kref ref_count;
unsigned long start;
unsigned long end;
enum vmw_bo_dirty_method method;
unsigned int change_count;
- unsigned int ref_count;
unsigned long bitmap_size;
unsigned long bitmap[];
};
@@ -221,7 +221,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo)
int ret;
if (dirty) {
- dirty->ref_count++;
+ kref_get(&dirty->ref_count);
return 0;
}
@@ -235,7 +235,7 @@ int vmw_bo_dirty_add(struct vmw_bo *vbo)
dirty->bitmap_size = num_pages;
dirty->start = dirty->bitmap_size;
dirty->end = 0;
- dirty->ref_count = 1;
+ kref_init(&dirty->ref_count);
if (num_pages < PAGE_SIZE / sizeof(pte_t)) {
dirty->method = VMW_BO_DIRTY_PAGETABLE;
} else {
@@ -274,10 +274,8 @@ void vmw_bo_dirty_release(struct vmw_bo *vbo)
{
struct vmw_bo_dirty *dirty = vbo->dirty;
- if (dirty && --dirty->ref_count == 0) {
- kvfree(dirty);
+ if (dirty && kref_put(&dirty->ref_count, (void *)kvfree))
vbo->dirty = NULL;
- }
}
/**
diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig
index 7219f6b884b6..4b288eb3f5b0 100644
--- a/drivers/gpu/drm/xe/Kconfig
+++ b/drivers/gpu/drm/xe/Kconfig
@@ -13,7 +13,6 @@ config DRM_XE
select TMPFS
select DRM_BUDDY
select DRM_CLIENT_SELECTION
- select DRM_EXEC
select DRM_KMS_HELPER
select DRM_KUNIT_TEST_HELPERS if DRM_XE_KUNIT_TEST != n
select DRM_PANEL
diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 51f2a03847f9..f680c8b8f258 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -168,6 +168,7 @@
#define XEHP_SLICE_COMMON_ECO_CHICKEN1 XE_REG_MCR(0x731c, XE_REG_OPTION_MASKED)
#define MSC_MSAA_REODER_BUF_BYPASS_DISABLE REG_BIT(14)
+#define FAST_CLEAR_VALIGN_FIX REG_BIT(13)
#define XE2LPM_CCCHKNREG1 XE_REG(0x82a8)
diff --git a/drivers/gpu/drm/xe/tests/xe_mocs.c b/drivers/gpu/drm/xe/tests/xe_mocs.c
index 0e502feaca81..6bb278167aaf 100644
--- a/drivers/gpu/drm/xe/tests/xe_mocs.c
+++ b/drivers/gpu/drm/xe/tests/xe_mocs.c
@@ -49,7 +49,7 @@ static void read_l3cc_table(struct xe_gt *gt,
fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL)) {
xe_force_wake_put(gt_to_fw(gt), fw_ref);
- KUNIT_ASSERT_TRUE_MSG(test, true, "Forcewake Failed.\n");
+ KUNIT_FAIL_AND_ABORT(test, "Forcewake Failed.\n");
}
for (i = 0; i < info->num_mocs_regs; i++) {
diff --git a/drivers/gpu/drm/xe/xe_gt_clock.c b/drivers/gpu/drm/xe/xe_gt_clock.c
index 4f011d1573c6..f65d1edd0567 100644
--- a/drivers/gpu/drm/xe/xe_gt_clock.c
+++ b/drivers/gpu/drm/xe/xe_gt_clock.c
@@ -93,11 +93,6 @@ int xe_gt_clock_init(struct xe_gt *gt)
return 0;
}
-static u64 div_u64_roundup(u64 n, u32 d)
-{
- return div_u64(n + d - 1, d);
-}
-
/**
* xe_gt_clock_interval_to_ms - Convert sampled GT clock ticks to msec
*
@@ -108,5 +103,5 @@ static u64 div_u64_roundup(u64 n, u32 d)
*/
u64 xe_gt_clock_interval_to_ms(struct xe_gt *gt, u64 count)
{
- return div_u64_roundup(count * MSEC_PER_SEC, gt->info.reference_clock);
+ return mul_u64_u32_div(count, MSEC_PER_SEC, gt->info.reference_clock);
}
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 283d846c3512..b7afe8e983cb 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -226,6 +226,12 @@ int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct)
xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
+ err = drmm_mutex_init(&xe->drm, &ct->lock);
+ if (err)
+ return err;
+
+ primelockdep(ct);
+
ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM);
if (!ct->g2h_wq)
return -ENOMEM;
@@ -237,16 +243,13 @@ int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct)
#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
spin_lock_init(&ct->dead.lock);
INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
+ stack_depot_init();
+#endif
#endif
init_waitqueue_head(&ct->wq);
init_waitqueue_head(&ct->g2h_fence_wq);
- err = drmm_mutex_init(&xe->drm, &ct->lock);
- if (err)
- return err;
-
- primelockdep(ct);
-
err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_irq.c b/drivers/gpu/drm/xe/xe_irq.c
index 870edaf69388..06976cc77918 100644
--- a/drivers/gpu/drm/xe/xe_irq.c
+++ b/drivers/gpu/drm/xe/xe_irq.c
@@ -847,22 +847,6 @@ static int xe_irq_msix_init(struct xe_device *xe)
return 0;
}
-static irqreturn_t guc2host_irq_handler(int irq, void *arg)
-{
- struct xe_device *xe = arg;
- struct xe_tile *tile;
- u8 id;
-
- if (!atomic_read(&xe->irq.enabled))
- return IRQ_NONE;
-
- for_each_tile(tile, xe, id)
- xe_guc_irq_handler(&tile->primary_gt->uc.guc,
- GUC_INTR_GUC2HOST);
-
- return IRQ_HANDLED;
-}
-
static irqreturn_t xe_irq_msix_default_hwe_handler(int irq, void *arg)
{
unsigned int tile_id, gt_id;
@@ -979,7 +963,7 @@ int xe_irq_msix_request_irqs(struct xe_device *xe)
u16 msix;
msix = GUC2HOST_MSIX;
- err = xe_irq_msix_request_irq(xe, guc2host_irq_handler, xe,
+ err = xe_irq_msix_request_irq(xe, xe_irq_handler(xe), xe,
DRIVER_NAME "-guc2host", false, &msix);
if (err)
return err;
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 9a6df79fc5b6..89cc6d32f041 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -375,6 +375,7 @@ static const struct pci_device_id pciidlist[] = {
INTEL_LNL_IDS(INTEL_VGA_DEVICE, &lnl_desc),
INTEL_BMG_IDS(INTEL_VGA_DEVICE, &bmg_desc),
INTEL_PTL_IDS(INTEL_VGA_DEVICE, &ptl_desc),
+ INTEL_WCL_IDS(INTEL_VGA_DEVICE, &ptl_desc),
{ }
};
MODULE_DEVICE_TABLE(pci, pciidlist);
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index ccb09ef4ec9e..cdd1dc540a59 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -3369,8 +3369,10 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm,
op == DRM_XE_VM_BIND_OP_PREFETCH) ||
XE_IOCTL_DBG(xe, prefetch_region &&
op != DRM_XE_VM_BIND_OP_PREFETCH) ||
- XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
- !(BIT(prefetch_region) & xe->info.mem_region_mask))) ||
+ XE_IOCTL_DBG(xe, (prefetch_region != DRM_XE_CONSULT_MEM_ADVISE_PREF_LOC &&
+ /* Guard against undefined shift in BIT(prefetch_region) */
+ (prefetch_region >= (sizeof(xe->info.mem_region_mask) * 8) ||
+ !(BIT(prefetch_region) & xe->info.mem_region_mask)))) ||
XE_IOCTL_DBG(xe, obj &&
op == DRM_XE_VM_BIND_OP_UNMAP) ||
XE_IOCTL_DBG(xe, (flags & DRM_XE_VM_BIND_FLAG_MADVISE_AUTORESET) &&
diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c
index cd03891654a1..3cf30718b200 100644
--- a/drivers/gpu/drm/xe/xe_wa.c
+++ b/drivers/gpu/drm/xe/xe_wa.c
@@ -679,6 +679,8 @@ static const struct xe_rtp_entry_sr engine_was[] = {
},
{ XE_RTP_NAME("14023061436"),
XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3001),
+ FUNC(xe_rtp_match_first_render_or_compute), OR,
+ GRAPHICS_VERSION_RANGE(3003, 3005),
FUNC(xe_rtp_match_first_render_or_compute)),
XE_RTP_ACTIONS(SET(TDL_CHICKEN, QID_WAIT_FOR_THREAD_NOT_RUN_DISABLE))
},
@@ -916,6 +918,15 @@ static const struct xe_rtp_entry_sr lrc_was[] = {
XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3003), ENGINE_CLASS(RENDER)),
XE_RTP_ACTIONS(SET(COMMON_SLICE_CHICKEN4, SBE_PUSH_CONSTANT_BEHIND_FIX_ENABLE))
},
+ { XE_RTP_NAME("14024681466"),
+ XE_RTP_RULES(GRAPHICS_VERSION_RANGE(3000, 3005), ENGINE_CLASS(RENDER)),
+ XE_RTP_ACTIONS(SET(XEHP_SLICE_COMMON_ECO_CHICKEN1, FAST_CLEAR_VALIGN_FIX))
+ },
+ { XE_RTP_NAME("15016589081"),
+ XE_RTP_RULES(GRAPHICS_VERSION(3000), GRAPHICS_STEP(A0, B0),
+ ENGINE_CLASS(RENDER)),
+ XE_RTP_ACTIONS(SET(CHICKEN_RASTER_1, DIS_CLIP_NEGATIVE_BOUNDING_BOX))
+ },
};
static __maybe_unused const struct xe_rtp_entry oob_was[] = {
diff --git a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
index 0a9b44ce4904..b0bab2a1ddcc 100644
--- a/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
+++ b/drivers/hid/amd-sfh-hid/sfh1_1/amd_sfh_init.c
@@ -194,6 +194,8 @@ static int amd_sfh1_1_hid_client_init(struct amd_mp2_dev *privdata)
if (rc)
goto cleanup;
+ mp2_ops->stop(privdata, cl_data->sensor_idx[i]);
+ amd_sfh_wait_for_response(privdata, cl_data->sensor_idx[i], DISABLE_SENSOR);
writel(0, privdata->mmio + amd_get_p2c_val(privdata, 0));
mp2_ops->start(privdata, info);
status = amd_sfh_wait_for_response
diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c
index 61404d7a43ee..57da4f86a9fa 100644
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -355,6 +355,7 @@ static const struct apple_key_translation swapped_fn_leftctrl_keys[] = {
static const struct apple_non_apple_keyboard non_apple_keyboards[] = {
{ "SONiX USB DEVICE" },
+ { "SONiX AK870 PRO" },
{ "Keychron" },
{ "AONE" },
{ "GANSS" },
diff --git a/drivers/hid/hid-corsair-void.c b/drivers/hid/hid-corsair-void.c
index fee134a7eba3..5e9a5b8f7f16 100644
--- a/drivers/hid/hid-corsair-void.c
+++ b/drivers/hid/hid-corsair-void.c
@@ -553,9 +553,8 @@ static void corsair_void_add_battery(struct corsair_void_drvdata *drvdata)
if (IS_ERR(new_supply)) {
hid_err(drvdata->hid_dev,
- "failed to register battery '%s' (reason: %ld)\n",
- drvdata->battery_desc.name,
- PTR_ERR(new_supply));
+ "failed to register battery '%s' (reason: %pe)\n",
+ drvdata->battery_desc.name, new_supply);
return;
}
diff --git a/drivers/hid/hid-elecom.c b/drivers/hid/hid-elecom.c
index 69771fd35006..981d1b6e9658 100644
--- a/drivers/hid/hid-elecom.c
+++ b/drivers/hid/hid-elecom.c
@@ -75,7 +75,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc,
*/
mouse_button_fixup(hdev, rdesc, *rsize, 20, 28, 22, 14, 8);
break;
- case USB_DEVICE_ID_ELECOM_M_XT3URBK:
+ case USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB:
+ case USB_DEVICE_ID_ELECOM_M_XT3URBK_018F:
case USB_DEVICE_ID_ELECOM_M_XT3DRBK:
case USB_DEVICE_ID_ELECOM_M_XT4DRBK:
/*
@@ -119,7 +120,8 @@ static const __u8 *elecom_report_fixup(struct hid_device *hdev, __u8 *rdesc,
static const struct hid_device_id elecom_devices[] = {
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) },
- { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) },
diff --git a/drivers/hid/hid-haptic.c b/drivers/hid/hid-haptic.c
index aa090684c1f2..fc8a9997f815 100644
--- a/drivers/hid/hid-haptic.c
+++ b/drivers/hid/hid-haptic.c
@@ -86,7 +86,7 @@ int hid_haptic_input_configured(struct hid_device *hdev,
if (hi->application == HID_DG_TOUCHPAD) {
if (haptic->auto_trigger_report &&
haptic->manual_trigger_report) {
- __set_bit(INPUT_PROP_HAPTIC_TOUCHPAD, hi->input->propbit);
+ __set_bit(INPUT_PROP_PRESSUREPAD, hi->input->propbit);
return 1;
}
return 0;
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
index 0723b4b1c9ec..c4589075a5ed 100644
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -449,7 +449,8 @@
#define USB_VENDOR_ID_ELECOM 0x056e
#define USB_DEVICE_ID_ELECOM_BM084 0x0061
#define USB_DEVICE_ID_ELECOM_M_XGL20DLBK 0x00e6
-#define USB_DEVICE_ID_ELECOM_M_XT3URBK 0x00fb
+#define USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB 0x00fb
+#define USB_DEVICE_ID_ELECOM_M_XT3URBK_018F 0x018f
#define USB_DEVICE_ID_ELECOM_M_XT3DRBK 0x00fc
#define USB_DEVICE_ID_ELECOM_M_XT4DRBK 0x00fd
#define USB_DEVICE_ID_ELECOM_M_DT1URBK 0x00fe
@@ -718,6 +719,7 @@
#define USB_DEVICE_ID_ITE_LENOVO_YOGA2 0x8350
#define I2C_DEVICE_ID_ITE_LENOVO_LEGION_Y720 0x837a
#define USB_DEVICE_ID_ITE_LENOVO_YOGA900 0x8396
+#define I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD 0x8987
#define USB_DEVICE_ID_ITE8595 0x8595
#define USB_DEVICE_ID_ITE_MEDION_E1239T 0xce50
@@ -1543,7 +1545,7 @@
#define USB_VENDOR_ID_SIGNOTEC 0x2133
#define USB_DEVICE_ID_SIGNOTEC_VIEWSONIC_PD1011 0x0018
-#define USB_VENDOR_ID_SMARTLINKTECHNOLOGY 0x4c4a
-#define USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155 0x4155
+#define USB_VENDOR_ID_JIELI_SDK_DEFAULT 0x4c4a
+#define USB_DEVICE_ID_JIELI_SDK_4155 0x4155
#endif
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index e56e7de53279..2bbb645c2ff4 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -399,10 +399,11 @@ static const struct hid_device_id hid_battery_quirks[] = {
{ HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_CHROMEBOOK_TROGDOR_POMPOM),
HID_BATTERY_QUIRK_AVOID_QUERY },
/*
- * Elan I2C-HID touchscreens seem to all report a non present battery,
- * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C-HID devices.
+ * Elan HID touchscreens seem to all report a non present battery,
+ * set HID_BATTERY_QUIRK_IGNORE for all Elan I2C and USB HID devices.
*/
{ HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, HID_ANY_ID), HID_BATTERY_QUIRK_IGNORE },
{}
};
diff --git a/drivers/hid/hid-lenovo.c b/drivers/hid/hid-lenovo.c
index 654879814f97..9cc3e029e9f6 100644
--- a/drivers/hid/hid-lenovo.c
+++ b/drivers/hid/hid-lenovo.c
@@ -148,6 +148,14 @@ static const __u8 lenovo_tpIIbtkbd_need_fixup_collection[] = {
0x81, 0x01, /* Input (Const,Array,Abs,No Wrap,Linear,Preferred State,No Null Position) */
};
+static const __u8 lenovo_yoga7x_kbd_need_fixup_collection[] = {
+ 0x15, 0x00, // Logical Minimum (0)
+ 0x25, 0x65, // Logical Maximum (101)
+ 0x05, 0x07, // Usage Page (Keyboard)
+ 0x19, 0x00, // Usage Minimum (0)
+ 0x29, 0xDD, // Usage Maximum (221)
+};
+
static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc,
unsigned int *rsize)
{
@@ -177,6 +185,13 @@ static const __u8 *lenovo_report_fixup(struct hid_device *hdev, __u8 *rdesc,
rdesc[260] = 0x01; /* report count (2) = 0x01 */
}
break;
+ case I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD:
+ if (*rsize == 176 &&
+ memcmp(&rdesc[52], lenovo_yoga7x_kbd_need_fixup_collection,
+ sizeof(lenovo_yoga7x_kbd_need_fixup_collection)) == 0) {
+ rdesc[55] = rdesc[61]; // logical maximum = usage maximum
+ }
+ break;
}
return rdesc;
}
@@ -1538,6 +1553,8 @@ static const struct hid_device_id lenovo_devices[] = {
USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X12_TAB) },
{ HID_DEVICE(BUS_USB, HID_GROUP_GENERIC,
USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X12_TAB2) },
+ { HID_DEVICE(BUS_I2C, HID_GROUP_GENERIC,
+ USB_VENDOR_ID_ITE, I2C_DEVICE_ID_ITE_LENOVO_YOGA_SLIM_7X_KEYBOARD) },
{ }
};
diff --git a/drivers/hid/hid-ntrig.c b/drivers/hid/hid-ntrig.c
index 0f76e241e0af..a7f10c45f62b 100644
--- a/drivers/hid/hid-ntrig.c
+++ b/drivers/hid/hid-ntrig.c
@@ -142,13 +142,13 @@ static void ntrig_report_version(struct hid_device *hdev)
int ret;
char buf[20];
struct usb_device *usb_dev = hid_to_usb_dev(hdev);
- unsigned char *data = kmalloc(8, GFP_KERNEL);
+ unsigned char *data __free(kfree) = kmalloc(8, GFP_KERNEL);
if (!hid_is_usb(hdev))
return;
if (!data)
- goto err_free;
+ return;
ret = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0),
USB_REQ_CLEAR_FEATURE,
@@ -163,9 +163,6 @@ static void ntrig_report_version(struct hid_device *hdev)
hid_info(hdev, "Firmware version: %s (%02x%02x %02x%02x)\n",
buf, data[2], data[3], data[4], data[5]);
}
-
-err_free:
- kfree(data);
}
static ssize_t show_phys_width(struct device *dev,
diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c
index 63f6eb9030d1..128aa6abd10b 100644
--- a/drivers/hid/hid-playstation.c
+++ b/drivers/hid/hid-playstation.c
@@ -1942,6 +1942,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4)
"Failed to retrieve DualShock4 calibration info: %d\n",
ret);
ret = -EILSEQ;
+ kfree(buf);
goto transfer_failed;
} else {
break;
@@ -1959,6 +1960,7 @@ static int dualshock4_get_calibration_data(struct dualshock4 *ds4)
if (ret) {
hid_warn(hdev, "Failed to retrieve DualShock4 calibration info: %d\n", ret);
+ kfree(buf);
goto transfer_failed;
}
}
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index bcd4bccf1a7c..c89a015686c0 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -410,7 +410,8 @@ static const struct hid_device_id hid_have_special_driver[] = {
#if IS_ENABLED(CONFIG_HID_ELECOM)
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_BM084) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XGL20DLBK) },
- { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_00FB) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3URBK_018F) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT3DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_XT4DRBK) },
{ HID_USB_DEVICE(USB_VENDOR_ID_ELECOM, USB_DEVICE_ID_ELECOM_M_DT1URBK) },
@@ -915,7 +916,6 @@ static const struct hid_device_id hid_ignore_list[] = {
#endif
{ HID_USB_DEVICE(USB_VENDOR_ID_YEALINK, USB_DEVICE_ID_YEALINK_P1K_P4K_B2K) },
{ HID_USB_DEVICE(USB_VENDOR_ID_QUANTA, USB_DEVICE_ID_QUANTA_HP_5MP_CAMERA_5473) },
- { HID_USB_DEVICE(USB_VENDOR_ID_SMARTLINKTECHNOLOGY, USB_DEVICE_ID_SMARTLINKTECHNOLOGY_4155) },
{ }
};
@@ -1064,6 +1064,18 @@ bool hid_ignore(struct hid_device *hdev)
strlen(elan_acpi_id[i].id)))
return true;
break;
+ case USB_VENDOR_ID_JIELI_SDK_DEFAULT:
+ /*
+ * Multiple USB devices with identical IDs (mic & touchscreen).
+ * The touch screen requires hid core processing, but the
+ * microphone does not. They can be distinguished by manufacturer
+ * and serial number.
+ */
+ if (hdev->product == USB_DEVICE_ID_JIELI_SDK_4155 &&
+ strncmp(hdev->name, "SmartlinkTechnology", 19) == 0 &&
+ strncmp(hdev->uniq, "20201111000001", 14) == 0)
+ return true;
+ break;
}
if (hdev->type == HID_TYPE_USBMOUSE &&
diff --git a/drivers/hid/hid-uclogic-params.c b/drivers/hid/hid-uclogic-params.c
index ffa14a4621ef..4c4bac6f792b 100644
--- a/drivers/hid/hid-uclogic-params.c
+++ b/drivers/hid/hid-uclogic-params.c
@@ -1369,8 +1369,10 @@ static int uclogic_params_ugee_v2_init_event_hooks(struct hid_device *hdev,
event_hook->hdev = hdev;
event_hook->size = ARRAY_SIZE(reconnect_event);
event_hook->event = kmemdup(reconnect_event, event_hook->size, GFP_KERNEL);
- if (!event_hook->event)
+ if (!event_hook->event) {
+ kfree(event_hook);
return -ENOMEM;
+ }
list_add_tail(&event_hook->list, &p->event_hooks->list);
diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c
index edd61ef50e16..95377c5f6335 100644
--- a/drivers/hid/usbhid/hid-pidff.c
+++ b/drivers/hid/usbhid/hid-pidff.c
@@ -806,8 +806,8 @@ static int pidff_request_effect_upload(struct pidff_device *pidff, int efnum)
static int pidff_needs_playback(struct pidff_device *pidff, int effect_id, int n)
{
- return pidff->effect[effect_id].is_infinite ||
- pidff->effect[effect_id].loop_count != n;
+ return !pidff->effect[effect_id].is_infinite ||
+ pidff->effect[effect_id].loop_count != n;
}
/*
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e3b2bd417c46..bed1a02425cd 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1870,8 +1870,6 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
struct hv_partition_creation_properties creation_properties = {};
union hv_partition_isolation_properties isolation_properties = {};
struct mshv_partition *partition;
- struct file *file;
- int fd;
long ret;
if (copy_from_user(&args, user_arg, sizeof(args)))
@@ -1938,29 +1936,13 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
goto delete_partition;
ret = mshv_init_async_handler(partition);
- if (ret)
- goto remove_partition;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- ret = fd;
- goto remove_partition;
- }
-
- file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
- partition, O_RDWR);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto put_fd;
+ if (!ret) {
+ ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
+ &mshv_partition_fops,
+ partition, O_RDWR));
+ if (ret >= 0)
+ return ret;
}
-
- fd_install(fd, file);
-
- return fd;
-
-put_fd:
- put_unused_fd(fd);
-remove_partition:
remove_partition(partition);
delete_partition:
hv_call_delete_partition(partition->pt_id);
diff --git a/drivers/hwmon/gpd-fan.c b/drivers/hwmon/gpd-fan.c
index 321794807e8d..f81c3bc422f4 100644
--- a/drivers/hwmon/gpd-fan.c
+++ b/drivers/hwmon/gpd-fan.c
@@ -12,9 +12,9 @@
* Copyright (c) 2024 Cryolitia PukNgae
*/
-#include <linux/acpi.h>
#include <linux/dmi.h>
#include <linux/hwmon.h>
+#include <linux/io.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -276,31 +276,6 @@ static int gpd_generic_read_rpm(void)
return (u16)high << 8 | low;
}
-static void gpd_win4_init_ec(void)
-{
- u8 chip_id, chip_ver;
-
- gpd_ecram_read(0x2000, &chip_id);
-
- if (chip_id == 0x55) {
- gpd_ecram_read(0x1060, &chip_ver);
- gpd_ecram_write(0x1060, chip_ver | 0x80);
- }
-}
-
-static int gpd_win4_read_rpm(void)
-{
- int ret;
-
- ret = gpd_generic_read_rpm();
-
- if (ret == 0)
- // Re-init EC when speed is 0
- gpd_win4_init_ec();
-
- return ret;
-}
-
static int gpd_wm2_read_rpm(void)
{
for (u16 pwm_ctr_offset = GPD_PWM_CTR_OFFSET;
@@ -320,11 +295,10 @@ static int gpd_wm2_read_rpm(void)
static int gpd_read_rpm(void)
{
switch (gpd_driver_priv.drvdata->board) {
+ case win4_6800u:
case win_mini:
case duo:
return gpd_generic_read_rpm();
- case win4_6800u:
- return gpd_win4_read_rpm();
case win_max_2:
return gpd_wm2_read_rpm();
}
@@ -607,6 +581,28 @@ static struct hwmon_chip_info gpd_fan_chip_info = {
.info = gpd_fan_hwmon_channel_info
};
+static void gpd_win4_init_ec(void)
+{
+ u8 chip_id, chip_ver;
+
+ gpd_ecram_read(0x2000, &chip_id);
+
+ if (chip_id == 0x55) {
+ gpd_ecram_read(0x1060, &chip_ver);
+ gpd_ecram_write(0x1060, chip_ver | 0x80);
+ }
+}
+
+static void gpd_init_ec(void)
+{
+ // The buggy firmware won't initialize EC properly on boot.
+ // Before its initialization, reading RPM will always return 0,
+ // and writing PWM will have no effect.
+ // Initialize it manually on driver load.
+ if (gpd_driver_priv.drvdata->board == win4_6800u)
+ gpd_win4_init_ec();
+}
+
static int gpd_fan_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -634,6 +630,8 @@ static int gpd_fan_probe(struct platform_device *pdev)
return dev_err_probe(dev, PTR_ERR(hwdev),
"Failed to register hwmon device\n");
+ gpd_init_ec();
+
return 0;
}
diff --git a/drivers/iio/accel/adxl355_core.c b/drivers/iio/accel/adxl355_core.c
index 2e00fd51b4d5..5fc7f814b907 100644
--- a/drivers/iio/accel/adxl355_core.c
+++ b/drivers/iio/accel/adxl355_core.c
@@ -56,6 +56,8 @@
#define ADXL355_POWER_CTL_DRDY_MSK BIT(2)
#define ADXL355_SELF_TEST_REG 0x2E
#define ADXL355_RESET_REG 0x2F
+#define ADXL355_BASE_ADDR_SHADOW_REG 0x50
+#define ADXL355_SHADOW_REG_COUNT 5
#define ADXL355_DEVID_AD_VAL 0xAD
#define ADXL355_DEVID_MST_VAL 0x1D
@@ -294,7 +296,12 @@ static void adxl355_fill_3db_frequency_table(struct adxl355_data *data)
static int adxl355_setup(struct adxl355_data *data)
{
unsigned int regval;
+ int retries = 5; /* the number is chosen based on empirical reasons */
int ret;
+ u8 *shadow_regs __free(kfree) = kzalloc(ADXL355_SHADOW_REG_COUNT, GFP_KERNEL);
+
+ if (!shadow_regs)
+ return -ENOMEM;
ret = regmap_read(data->regmap, ADXL355_DEVID_AD_REG, &regval);
if (ret)
@@ -321,14 +328,41 @@ static int adxl355_setup(struct adxl355_data *data)
if (regval != ADXL355_PARTID_VAL)
dev_warn(data->dev, "Invalid DEV ID 0x%02x\n", regval);
- /*
- * Perform a software reset to make sure the device is in a consistent
- * state after start-up.
- */
- ret = regmap_write(data->regmap, ADXL355_RESET_REG, ADXL355_RESET_CODE);
+ /* Read shadow registers to be compared after reset */
+ ret = regmap_bulk_read(data->regmap,
+ ADXL355_BASE_ADDR_SHADOW_REG,
+ shadow_regs, ADXL355_SHADOW_REG_COUNT);
if (ret)
return ret;
+ do {
+ if (--retries == 0) {
+ dev_err(data->dev, "Shadow registers mismatch\n");
+ return -EIO;
+ }
+
+ /*
+ * Perform a software reset to make sure the device is in a consistent
+ * state after start-up.
+ */
+ ret = regmap_write(data->regmap, ADXL355_RESET_REG,
+ ADXL355_RESET_CODE);
+ if (ret)
+ return ret;
+
+ /* Wait at least 5ms after software reset */
+ usleep_range(5000, 10000);
+
+ /* Read shadow registers for comparison */
+ ret = regmap_bulk_read(data->regmap,
+ ADXL355_BASE_ADDR_SHADOW_REG,
+ data->buffer.buf,
+ ADXL355_SHADOW_REG_COUNT);
+ if (ret)
+ return ret;
+ } while (memcmp(shadow_regs, data->buffer.buf,
+ ADXL355_SHADOW_REG_COUNT));
+
ret = regmap_update_bits(data->regmap, ADXL355_POWER_CTL_REG,
ADXL355_POWER_CTL_DRDY_MSK,
FIELD_PREP(ADXL355_POWER_CTL_DRDY_MSK, 1));
diff --git a/drivers/iio/accel/bmc150-accel-core.c b/drivers/iio/accel/bmc150-accel-core.c
index 3c5d1560b163..42ccf0316ce5 100644
--- a/drivers/iio/accel/bmc150-accel-core.c
+++ b/drivers/iio/accel/bmc150-accel-core.c
@@ -523,6 +523,10 @@ static int bmc150_accel_set_interrupt(struct bmc150_accel_data *data, int i,
const struct bmc150_accel_interrupt_info *info = intr->info;
int ret;
+ /* We do not always have an IRQ */
+ if (data->irq <= 0)
+ return 0;
+
if (state) {
if (atomic_inc_return(&intr->users) > 1)
return 0;
@@ -1696,6 +1700,7 @@ int bmc150_accel_core_probe(struct device *dev, struct regmap *regmap, int irq,
}
if (irq > 0) {
+ data->irq = irq;
ret = devm_request_threaded_irq(dev, irq,
bmc150_accel_irq_handler,
bmc150_accel_irq_thread_handler,
diff --git a/drivers/iio/accel/bmc150-accel.h b/drivers/iio/accel/bmc150-accel.h
index 7a7baf52e595..e8f26198359f 100644
--- a/drivers/iio/accel/bmc150-accel.h
+++ b/drivers/iio/accel/bmc150-accel.h
@@ -58,6 +58,7 @@ enum bmc150_accel_trigger_id {
struct bmc150_accel_data {
struct regmap *regmap;
+ int irq;
struct regulator_bulk_data regulators[2];
struct bmc150_accel_interrupt interrupts[BMC150_ACCEL_INTERRUPTS];
struct bmc150_accel_trigger triggers[BMC150_ACCEL_TRIGGERS];
diff --git a/drivers/iio/adc/ad4030.c b/drivers/iio/adc/ad4030.c
index 1bc2f9a22470..d8bee6a4215a 100644
--- a/drivers/iio/adc/ad4030.c
+++ b/drivers/iio/adc/ad4030.c
@@ -385,7 +385,7 @@ static int ad4030_get_chan_scale(struct iio_dev *indio_dev,
struct ad4030_state *st = iio_priv(indio_dev);
const struct iio_scan_type *scan_type;
- scan_type = iio_get_current_scan_type(indio_dev, st->chip->channels);
+ scan_type = iio_get_current_scan_type(indio_dev, chan);
if (IS_ERR(scan_type))
return PTR_ERR(scan_type);
diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c
index 910b40393f77..61623cc6cb25 100644
--- a/drivers/iio/adc/ad7124.c
+++ b/drivers/iio/adc/ad7124.c
@@ -1525,10 +1525,6 @@ static int __ad7124_calibrate_all(struct ad7124_state *st, struct iio_dev *indio
int ret, i;
for (i = 0; i < st->num_channels; i++) {
-
- if (indio_dev->channels[i].type != IIO_VOLTAGE)
- continue;
-
/*
* For calibration the OFFSET register should hold its reset default
* value. For the GAIN register there is no such requirement but
@@ -1539,6 +1535,14 @@ static int __ad7124_calibrate_all(struct ad7124_state *st, struct iio_dev *indio
st->channels[i].cfg.calibration_gain = st->gain_default;
/*
+ * Only the main voltage input channels are important enough
+ * to be automatically calibrated here. For everything else,
+ * just use the default values set above.
+ */
+ if (indio_dev->channels[i].type != IIO_VOLTAGE)
+ continue;
+
+ /*
* Full-scale calibration isn't supported at gain 1, so skip in
* that case. Note that untypically full-scale calibration has
* to happen before zero-scale calibration. This only applies to
diff --git a/drivers/iio/adc/ad7280a.c b/drivers/iio/adc/ad7280a.c
index dda2986ccda0..50a6ff7c8b1c 100644
--- a/drivers/iio/adc/ad7280a.c
+++ b/drivers/iio/adc/ad7280a.c
@@ -541,7 +541,7 @@ static ssize_t ad7280_store_balance_timer(struct iio_dev *indio_dev,
int val, val2;
int ret;
- ret = iio_str_to_fixpoint(buf, 1000, &val, &val2);
+ ret = iio_str_to_fixpoint(buf, 100, &val, &val2);
if (ret)
return ret;
diff --git a/drivers/iio/adc/ad7380.c b/drivers/iio/adc/ad7380.c
index fa251dc1aae6..bfd908deefc0 100644
--- a/drivers/iio/adc/ad7380.c
+++ b/drivers/iio/adc/ad7380.c
@@ -1227,6 +1227,14 @@ static int ad7380_offload_buffer_postenable(struct iio_dev *indio_dev)
if (ret)
return ret;
+ /*
+ * When the sequencer is required to read all channels, we need to
+ * trigger twice per sample period in order to read one complete set
+ * of samples.
+ */
+ if (st->seq)
+ config.periodic.frequency_hz *= 2;
+
ret = spi_offload_trigger_enable(st->offload, st->offload_trigger, &config);
if (ret)
spi_unoptimize_message(&st->offload_msg);
diff --git a/drivers/iio/adc/rtq6056.c b/drivers/iio/adc/rtq6056.c
index ad9738228b7f..2bf3a09ac6b0 100644
--- a/drivers/iio/adc/rtq6056.c
+++ b/drivers/iio/adc/rtq6056.c
@@ -300,7 +300,7 @@ static int rtq6056_adc_read_channel(struct rtq6056_priv *priv,
return IIO_VAL_INT;
case RTQ6056_REG_SHUNTVOLT:
case RTQ6056_REG_CURRENT:
- *val = sign_extend32(regval, 16);
+ *val = sign_extend32(regval, 15);
return IIO_VAL_INT;
default:
return -EINVAL;
diff --git a/drivers/iio/adc/stm32-dfsdm-adc.c b/drivers/iio/adc/stm32-dfsdm-adc.c
index 74b1b4dc6e81..9664b9bd75d4 100644
--- a/drivers/iio/adc/stm32-dfsdm-adc.c
+++ b/drivers/iio/adc/stm32-dfsdm-adc.c
@@ -725,9 +725,8 @@ static int stm32_dfsdm_generic_channel_parse_of(struct stm32_dfsdm *dfsdm,
}
df_ch->src = val;
- ret = fwnode_property_read_u32(node, "st,adc-alt-channel", &df_ch->alt_si);
- if (ret != -EINVAL)
- df_ch->alt_si = 0;
+ if (fwnode_property_present(node, "st,adc-alt-channel"))
+ df_ch->alt_si = 1;
if (adc->dev_data->type == DFSDM_IIO) {
backend = devm_iio_backend_fwnode_get(&indio_dev->dev, NULL, node);
diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index ee294a775e8a..7a7a9d37339b 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -786,6 +786,12 @@ out_end_signalling:
}
EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_enqueue_dmabuf, "IIO_DMA_BUFFER");
+struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer)
+{
+ return iio_buffer_to_queue(buffer)->dev;
+}
+EXPORT_SYMBOL_NS_GPL(iio_dma_buffer_get_dma_dev, "IIO_DMA_BUFFER");
+
void iio_dma_buffer_lock_queue(struct iio_buffer *buffer)
{
struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buffer);
diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index e9d9a7d39fe1..27dd56334345 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -177,6 +177,8 @@ static const struct iio_buffer_access_funcs iio_dmaengine_buffer_ops = {
.lock_queue = iio_dma_buffer_lock_queue,
.unlock_queue = iio_dma_buffer_unlock_queue,
+ .get_dma_dev = iio_dma_buffer_get_dma_dev,
+
.modes = INDIO_BUFFER_HARDWARE,
.flags = INDIO_BUFFER_FLAG_FIXED_WATERMARK,
};
diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c
index 1e167dc673ca..da09c9f3ceb6 100644
--- a/drivers/iio/common/ssp_sensors/ssp_dev.c
+++ b/drivers/iio/common/ssp_sensors/ssp_dev.c
@@ -503,7 +503,7 @@ static int ssp_probe(struct spi_device *spi)
ret = spi_setup(spi);
if (ret < 0) {
dev_err(&spi->dev, "Failed to setup spi\n");
- return ret;
+ goto err_setup_spi;
}
data->fw_dl_state = SSP_FW_DL_STATE_NONE;
@@ -568,6 +568,8 @@ err_read_reg:
err_setup_irq:
mutex_destroy(&data->pending_lock);
mutex_destroy(&data->comm_lock);
+err_setup_spi:
+ mfd_remove_devices(&spi->dev);
dev_err(&spi->dev, "Probe failed!\n");
diff --git a/drivers/iio/humidity/hdc3020.c b/drivers/iio/humidity/hdc3020.c
index ffb25596d3a8..78b2c171c8da 100644
--- a/drivers/iio/humidity/hdc3020.c
+++ b/drivers/iio/humidity/hdc3020.c
@@ -72,6 +72,9 @@
#define HDC3020_MAX_TEMP_HYST_MICRO 164748607
#define HDC3020_MAX_HUM_MICRO 99220264
+/* Divide 65535 from the datasheet by 5 to avoid overflows */
+#define HDC3020_THRESH_FRACTION (65535 / 5)
+
struct hdc3020_data {
struct i2c_client *client;
struct gpio_desc *reset_gpio;
@@ -301,9 +304,9 @@ static int hdc3020_read_raw(struct iio_dev *indio_dev,
case IIO_CHAN_INFO_SCALE:
*val2 = 65536;
if (chan->type == IIO_TEMP)
- *val = 175;
+ *val = 175 * MILLI;
else
- *val = 100;
+ *val = 100 * MILLI;
return IIO_VAL_FRACTIONAL;
case IIO_CHAN_INFO_OFFSET:
@@ -376,15 +379,18 @@ static int hdc3020_thresh_get_temp(u16 thresh)
int temp;
/*
- * Get the temperature threshold from 9 LSBs, shift them to get
- * the truncated temperature threshold representation and
- * calculate the threshold according to the formula in the
- * datasheet. Result is degree celsius scaled by 65535.
+ * Get the temperature threshold from 9 LSBs, shift them to get the
+ * truncated temperature threshold representation and calculate the
+ * threshold according to the explicit formula in the datasheet:
+ * T(C) = -45 + (175 * temp) / 65535.
+ * Additionally scale by HDC3020_THRESH_FRACTION to avoid precision loss
+ * when calculating threshold and hysteresis values. Result is degree
+ * celsius scaled by HDC3020_THRESH_FRACTION.
*/
temp = FIELD_GET(HDC3020_THRESH_TEMP_MASK, thresh) <<
HDC3020_THRESH_TEMP_TRUNC_SHIFT;
- return -2949075 + (175 * temp);
+ return -2949075 / 5 + (175 / 5 * temp);
}
static int hdc3020_thresh_get_hum(u16 thresh)
@@ -394,13 +400,16 @@ static int hdc3020_thresh_get_hum(u16 thresh)
/*
* Get the humidity threshold from 7 MSBs, shift them to get the
* truncated humidity threshold representation and calculate the
- * threshold according to the formula in the datasheet. Result is
- * percent scaled by 65535.
+ * threshold according to the explicit formula in the datasheet:
+ * RH(%) = 100 * hum / 65535.
+ * Additionally scale by HDC3020_THRESH_FRACTION to avoid precision loss
+ * when calculating threshold and hysteresis values. Result is percent
+ * scaled by HDC3020_THRESH_FRACTION.
*/
hum = FIELD_GET(HDC3020_THRESH_HUM_MASK, thresh) <<
HDC3020_THRESH_HUM_TRUNC_SHIFT;
- return hum * 100;
+ return hum * 100 / 5;
}
static u16 hdc3020_thresh_set_temp(int s_temp, u16 curr_thresh)
@@ -455,8 +464,8 @@ int hdc3020_thresh_clr(s64 s_thresh, s64 s_hyst, enum iio_event_direction dir)
else
s_clr = s_thresh + s_hyst;
- /* Divide by 65535 to get units of micro */
- return div_s64(s_clr, 65535);
+ /* Divide by HDC3020_THRESH_FRACTION to get units of micro */
+ return div_s64(s_clr, HDC3020_THRESH_FRACTION);
}
static int _hdc3020_write_thresh(struct hdc3020_data *data, u16 reg, u16 val)
@@ -507,7 +516,7 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev,
clr = ret;
/* Scale value to include decimal part into calculations */
- s_val = (val < 0) ? (val * 1000000 - val2) : (val * 1000000 + val2);
+ s_val = (val < 0) ? (val * 1000 - val2) : (val * 1000 + val2);
switch (chan->type) {
case IIO_TEMP:
switch (info) {
@@ -523,7 +532,8 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev,
/* Calculate old hysteresis */
s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000;
s_clr = (s64)hdc3020_thresh_get_temp(clr) * 1000000;
- s_hyst = div_s64(abs(s_thresh - s_clr), 65535);
+ s_hyst = div_s64(abs(s_thresh - s_clr),
+ HDC3020_THRESH_FRACTION);
/* Set new threshold */
thresh = reg_val;
/* Set old hysteresis */
@@ -532,16 +542,17 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev,
case IIO_EV_INFO_HYSTERESIS:
/*
* Function hdc3020_thresh_get_temp returns temperature
- * in degree celsius scaled by 65535. Scale by 1000000
- * to be able to subtract scaled hysteresis value.
+ * in degree celsius scaled by HDC3020_THRESH_FRACTION.
+ * Scale by 1000000 to be able to subtract scaled
+ * hysteresis value.
*/
s_thresh = (s64)hdc3020_thresh_get_temp(thresh) * 1000000;
/*
* Units of s_val are in micro degree celsius, scale by
- * 65535 to get same units as s_thresh.
+ * HDC3020_THRESH_FRACTION to get same units as s_thresh.
*/
s_val = min(abs(s_val), HDC3020_MAX_TEMP_HYST_MICRO);
- s_hyst = (s64)s_val * 65535;
+ s_hyst = (s64)s_val * HDC3020_THRESH_FRACTION;
s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir);
s_clr = max(s_clr, HDC3020_MIN_TEMP_MICRO);
s_clr = min(s_clr, HDC3020_MAX_TEMP_MICRO);
@@ -565,7 +576,8 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev,
/* Calculate old hysteresis */
s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000;
s_clr = (s64)hdc3020_thresh_get_hum(clr) * 1000000;
- s_hyst = div_s64(abs(s_thresh - s_clr), 65535);
+ s_hyst = div_s64(abs(s_thresh - s_clr),
+ HDC3020_THRESH_FRACTION);
/* Set new threshold */
thresh = reg_val;
/* Try to set old hysteresis */
@@ -574,15 +586,16 @@ static int hdc3020_write_thresh(struct iio_dev *indio_dev,
case IIO_EV_INFO_HYSTERESIS:
/*
* Function hdc3020_thresh_get_hum returns relative
- * humidity in percent scaled by 65535. Scale by 1000000
- * to be able to subtract scaled hysteresis value.
+ * humidity in percent scaled by HDC3020_THRESH_FRACTION.
+ * Scale by 1000000 to be able to subtract scaled
+ * hysteresis value.
*/
s_thresh = (s64)hdc3020_thresh_get_hum(thresh) * 1000000;
/*
- * Units of s_val are in micro percent, scale by 65535
- * to get same units as s_thresh.
+ * Units of s_val are in micro percent, scale by
+ * HDC3020_THRESH_FRACTION to get same units as s_thresh.
*/
- s_hyst = (s64)s_val * 65535;
+ s_hyst = (s64)s_val * HDC3020_THRESH_FRACTION;
s_clr = hdc3020_thresh_clr(s_thresh, s_hyst, dir);
s_clr = max(s_clr, 0);
s_clr = min(s_clr, HDC3020_MAX_HUM_MICRO);
@@ -630,7 +643,7 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev,
thresh = hdc3020_thresh_get_temp(ret);
switch (info) {
case IIO_EV_INFO_VALUE:
- *val = thresh;
+ *val = thresh * MILLI;
break;
case IIO_EV_INFO_HYSTERESIS:
ret = hdc3020_read_be16(data, reg_clr);
@@ -638,18 +651,18 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev,
return ret;
clr = hdc3020_thresh_get_temp(ret);
- *val = abs(thresh - clr);
+ *val = abs(thresh - clr) * MILLI;
break;
default:
return -EOPNOTSUPP;
}
- *val2 = 65535;
+ *val2 = HDC3020_THRESH_FRACTION;
return IIO_VAL_FRACTIONAL;
case IIO_HUMIDITYRELATIVE:
thresh = hdc3020_thresh_get_hum(ret);
switch (info) {
case IIO_EV_INFO_VALUE:
- *val = thresh;
+ *val = thresh * MILLI;
break;
case IIO_EV_INFO_HYSTERESIS:
ret = hdc3020_read_be16(data, reg_clr);
@@ -657,12 +670,12 @@ static int hdc3020_read_thresh(struct iio_dev *indio_dev,
return ret;
clr = hdc3020_thresh_get_hum(ret);
- *val = abs(thresh - clr);
+ *val = abs(thresh - clr) * MILLI;
break;
default:
return -EOPNOTSUPP;
}
- *val2 = 65535;
+ *val2 = HDC3020_THRESH_FRACTION;
return IIO_VAL_FRACTIONAL;
default:
return -EOPNOTSUPP;
diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h
index c225b246c8a5..381b016fa524 100644
--- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h
+++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx.h
@@ -192,6 +192,22 @@ struct st_lsm6dsx_fifo_ops {
* @fifo_en: Hw timer FIFO enable register info (addr + mask).
* @decimator: Hw timer FIFO decimator register info (addr + mask).
* @freq_fine: Difference in % of ODR with respect to the typical.
+ * @ts_sensitivity: Nominal timestamp sensitivity.
+ * @ts_trim_coeff: Coefficient for calculating the calibrated timestamp gain.
+ * This coefficient comes into play when linearizing the formula
+ * used to calculate the calibrated timestamp (please see the
+ * relevant formula in the AN for the specific IMU).
+ * For example, in the case of LSM6DSO we have:
+ *
+ * 1 / (1 + x) ~= 1 - x (Taylor’s Series)
+ * ttrim[s] = 1 / (40000 * (1 + 0.0015 * val)) (from AN5192)
+ * ttrim[ns] ~= 25000 - 37.5 * val
+ * ttrim[ns] ~= 25000 - (37500 * val) / 1000
+ *
+ * so, replacing ts_sensitivity = 25000 and
+ * ts_trim_coeff = 37500
+ *
+ * ttrim[ns] ~= ts_sensitivity - (ts_trim_coeff * val) / 1000
*/
struct st_lsm6dsx_hw_ts_settings {
struct st_lsm6dsx_reg timer_en;
@@ -199,6 +215,8 @@ struct st_lsm6dsx_hw_ts_settings {
struct st_lsm6dsx_reg fifo_en;
struct st_lsm6dsx_reg decimator;
u8 freq_fine;
+ u16 ts_sensitivity;
+ u16 ts_trim_coeff;
};
/**
@@ -252,6 +270,15 @@ struct st_lsm6dsx_event_settings {
u8 wakeup_src_x_mask;
};
+enum st_lsm6dsx_sensor_id {
+ ST_LSM6DSX_ID_GYRO,
+ ST_LSM6DSX_ID_ACC,
+ ST_LSM6DSX_ID_EXT0,
+ ST_LSM6DSX_ID_EXT1,
+ ST_LSM6DSX_ID_EXT2,
+ ST_LSM6DSX_ID_MAX
+};
+
enum st_lsm6dsx_ext_sensor_id {
ST_LSM6DSX_ID_MAGN,
};
@@ -337,23 +364,14 @@ struct st_lsm6dsx_settings {
struct st_lsm6dsx_odr_table_entry odr_table[2];
struct st_lsm6dsx_samples_to_discard samples_to_discard[2];
struct st_lsm6dsx_fs_table_entry fs_table[2];
- struct st_lsm6dsx_reg decimator[ST_LSM6DSX_MAX_ID];
- struct st_lsm6dsx_reg batch[ST_LSM6DSX_MAX_ID];
+ struct st_lsm6dsx_reg decimator[ST_LSM6DSX_ID_MAX];
+ struct st_lsm6dsx_reg batch[2];
struct st_lsm6dsx_fifo_ops fifo_ops;
struct st_lsm6dsx_hw_ts_settings ts_settings;
struct st_lsm6dsx_shub_settings shub_settings;
struct st_lsm6dsx_event_settings event_settings;
};
-enum st_lsm6dsx_sensor_id {
- ST_LSM6DSX_ID_GYRO,
- ST_LSM6DSX_ID_ACC,
- ST_LSM6DSX_ID_EXT0,
- ST_LSM6DSX_ID_EXT1,
- ST_LSM6DSX_ID_EXT2,
- ST_LSM6DSX_ID_MAX,
-};
-
enum st_lsm6dsx_fifo_mode {
ST_LSM6DSX_FIFO_BYPASS = 0x0,
ST_LSM6DSX_FIFO_CONT = 0x6,
diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
index d8cb4b0218d5..a2daf0c14d96 100644
--- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
+++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_core.c
@@ -94,8 +94,6 @@
#define ST_LSM6DSX_REG_WHOAMI_ADDR 0x0f
-#define ST_LSM6DSX_TS_SENSITIVITY 25000UL /* 25us */
-
static const struct iio_chan_spec st_lsm6dsx_acc_channels[] = {
ST_LSM6DSX_CHANNEL_ACC(IIO_ACCEL, 0x28, IIO_MOD_X, 0),
ST_LSM6DSX_CHANNEL_ACC(IIO_ACCEL, 0x2a, IIO_MOD_Y, 1),
@@ -983,6 +981,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = {
.mask = GENMASK(7, 6),
},
.freq_fine = 0x63,
+ .ts_sensitivity = 25000,
+ .ts_trim_coeff = 37500,
},
.shub_settings = {
.page_mux = {
@@ -1196,6 +1196,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = {
.mask = GENMASK(7, 6),
},
.freq_fine = 0x63,
+ .ts_sensitivity = 25000,
+ .ts_trim_coeff = 37500,
},
.event_settings = {
.enable_reg = {
@@ -1371,6 +1373,8 @@ static const struct st_lsm6dsx_settings st_lsm6dsx_sensor_settings[] = {
.mask = GENMASK(7, 6),
},
.freq_fine = 0x4f,
+ .ts_sensitivity = 21701,
+ .ts_trim_coeff = 28212,
},
.shub_settings = {
.page_mux = {
@@ -2248,20 +2252,13 @@ static int st_lsm6dsx_init_hw_timer(struct st_lsm6dsx_hw *hw)
}
/* calibrate timestamp sensitivity */
- hw->ts_gain = ST_LSM6DSX_TS_SENSITIVITY;
+ hw->ts_gain = ts_settings->ts_sensitivity;
if (ts_settings->freq_fine) {
err = regmap_read(hw->regmap, ts_settings->freq_fine, &val);
if (err < 0)
return err;
- /*
- * linearize the AN5192 formula:
- * 1 / (1 + x) ~= 1 - x (Taylor’s Series)
- * ttrim[s] = 1 / (40000 * (1 + 0.0015 * val))
- * ttrim[ns] ~= 25000 - 37.5 * val
- * ttrim[ns] ~= 25000 - (37500 * val) / 1000
- */
- hw->ts_gain -= ((s8)val * 37500) / 1000;
+ hw->ts_gain -= ((s8)val * ts_settings->ts_trim_coeff) / 1000;
}
return 0;
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index a80f7cc25a27..96ea0f039dfb 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -1623,19 +1623,28 @@ static int iio_dma_resv_lock(struct dma_buf *dmabuf, bool nonblock)
return 0;
}
+static struct device *iio_buffer_get_dma_dev(const struct iio_dev *indio_dev,
+ struct iio_buffer *buffer)
+{
+ if (buffer->access->get_dma_dev)
+ return buffer->access->get_dma_dev(buffer);
+
+ return indio_dev->dev.parent;
+}
+
static struct dma_buf_attachment *
iio_buffer_find_attachment(struct iio_dev_buffer_pair *ib,
struct dma_buf *dmabuf, bool nonblock)
{
- struct device *dev = ib->indio_dev->dev.parent;
struct iio_buffer *buffer = ib->buffer;
+ struct device *dma_dev = iio_buffer_get_dma_dev(ib->indio_dev, buffer);
struct dma_buf_attachment *attach = NULL;
struct iio_dmabuf_priv *priv;
guard(mutex)(&buffer->dmabufs_mutex);
list_for_each_entry(priv, &buffer->dmabufs, entry) {
- if (priv->attach->dev == dev
+ if (priv->attach->dev == dma_dev
&& priv->attach->dmabuf == dmabuf) {
attach = priv->attach;
break;
@@ -1653,6 +1662,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
{
struct iio_dev *indio_dev = ib->indio_dev;
struct iio_buffer *buffer = ib->buffer;
+ struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer);
struct dma_buf_attachment *attach;
struct iio_dmabuf_priv *priv, *each;
struct dma_buf *dmabuf;
@@ -1679,7 +1689,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
goto err_free_priv;
}
- attach = dma_buf_attach(dmabuf, indio_dev->dev.parent);
+ attach = dma_buf_attach(dmabuf, dma_dev);
if (IS_ERR(attach)) {
err = PTR_ERR(attach);
goto err_dmabuf_put;
@@ -1719,7 +1729,7 @@ static int iio_buffer_attach_dmabuf(struct iio_dev_buffer_pair *ib,
* combo. If we do, refuse to attach.
*/
list_for_each_entry(each, &buffer->dmabufs, entry) {
- if (each->attach->dev == indio_dev->dev.parent
+ if (each->attach->dev == dma_dev
&& each->attach->dmabuf == dmabuf) {
/*
* We unlocked the reservation object, so going through
@@ -1758,6 +1768,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib,
{
struct iio_buffer *buffer = ib->buffer;
struct iio_dev *indio_dev = ib->indio_dev;
+ struct device *dma_dev = iio_buffer_get_dma_dev(indio_dev, buffer);
struct iio_dmabuf_priv *priv;
struct dma_buf *dmabuf;
int dmabuf_fd, ret = -EPERM;
@@ -1772,7 +1783,7 @@ static int iio_buffer_detach_dmabuf(struct iio_dev_buffer_pair *ib,
guard(mutex)(&buffer->dmabufs_mutex);
list_for_each_entry(priv, &buffer->dmabufs, entry) {
- if (priv->attach->dev == indio_dev->dev.parent
+ if (priv->attach->dev == dma_dev
&& priv->attach->dmabuf == dmabuf) {
list_del(&priv->entry);
diff --git a/drivers/iio/pressure/bmp280-core.c b/drivers/iio/pressure/bmp280-core.c
index c04e8bb4c993..d983ce9c0b99 100644
--- a/drivers/iio/pressure/bmp280-core.c
+++ b/drivers/iio/pressure/bmp280-core.c
@@ -1040,13 +1040,16 @@ static int bmp280_wait_conv(struct bmp280_data *data)
unsigned int reg, meas_time_us;
int ret;
- /* Check if we are using a BME280 device */
- if (data->oversampling_humid)
- meas_time_us = BMP280_PRESS_HUMID_MEAS_OFFSET +
- BIT(data->oversampling_humid) * BMP280_MEAS_DUR;
+ /* Constant part of the measurement time */
+ meas_time_us = BMP280_MEAS_OFFSET;
- else
- meas_time_us = 0;
+ /*
+ * Check if we are using a BME280 device,
+ * Humidity measurement time
+ */
+ if (data->chip_info->oversampling_humid_avail)
+ meas_time_us += BMP280_PRESS_HUMID_MEAS_OFFSET +
+ BIT(data->oversampling_humid) * BMP280_MEAS_DUR;
/* Pressure measurement time */
meas_time_us += BMP280_PRESS_HUMID_MEAS_OFFSET +
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index a23b364e24ff..651d76bca114 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -1020,15 +1020,18 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN)
MLX5_SET(cqc, cqc, oi, 1);
+ if (udata) {
+ cq->mcq.comp = mlx5_add_cq_to_tasklet;
+ cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
+ } else {
+ cq->mcq.comp = mlx5_ib_cq_comp;
+ }
+
err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
if (err)
goto err_cqb;
mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
- if (udata)
- cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
- else
- cq->mcq.comp = mlx5_ib_cq_comp;
cq->mcq.event = mlx5_ib_cq_event;
INIT_LIST_HEAD(&cq->wc_list);
diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index f7209c8ebbcc..1c6b0461dc35 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -261,6 +261,12 @@ static int cros_ec_keyb_work(struct notifier_block *nb,
case EC_MKBP_EVENT_KEY_MATRIX:
pm_wakeup_event(ckdev->dev, 0);
+ if (!ckdev->idev) {
+ dev_warn_once(ckdev->dev,
+ "Unexpected key matrix event\n");
+ return NOTIFY_OK;
+ }
+
if (ckdev->ec->event_size != ckdev->cols) {
dev_err(ckdev->dev,
"Discarded incomplete key matrix event.\n");
diff --git a/drivers/input/keyboard/imx_sc_key.c b/drivers/input/keyboard/imx_sc_key.c
index d18839f1f4f6..b620cd310cdb 100644
--- a/drivers/input/keyboard/imx_sc_key.c
+++ b/drivers/input/keyboard/imx_sc_key.c
@@ -158,7 +158,7 @@ static int imx_sc_key_probe(struct platform_device *pdev)
return error;
}
- error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, &priv);
+ error = devm_add_action_or_reset(&pdev->dev, imx_sc_key_action, priv);
if (error)
return error;
diff --git a/drivers/input/tablet/pegasus_notetaker.c b/drivers/input/tablet/pegasus_notetaker.c
index 8d6b71d59793..eabb4a0b8a0d 100644
--- a/drivers/input/tablet/pegasus_notetaker.c
+++ b/drivers/input/tablet/pegasus_notetaker.c
@@ -63,6 +63,9 @@
#define BUTTON_PRESSED 0xb5
#define COMMAND_VERSION 0xa9
+/* 1 Status + 1 Color + 2 X + 2 Y = 6 bytes */
+#define NOTETAKER_PACKET_SIZE 6
+
/* in xy data packet */
#define BATTERY_NO_REPORT 0x40
#define BATTERY_LOW 0x41
@@ -311,6 +314,12 @@ static int pegasus_probe(struct usb_interface *intf,
}
pegasus->data_len = usb_maxpacket(dev, pipe);
+ if (pegasus->data_len < NOTETAKER_PACKET_SIZE) {
+ dev_err(&intf->dev, "packet size is too small (%d)\n",
+ pegasus->data_len);
+ error = -EINVAL;
+ goto err_free_mem;
+ }
pegasus->data = usb_alloc_coherent(dev, pegasus->data_len, GFP_KERNEL,
&pegasus->data_dma);
diff --git a/drivers/input/touchscreen/goodix.c b/drivers/input/touchscreen/goodix.c
index 252dcae039f8..f8798d11ec03 100644
--- a/drivers/input/touchscreen/goodix.c
+++ b/drivers/input/touchscreen/goodix.c
@@ -796,17 +796,6 @@ int goodix_reset_no_int_sync(struct goodix_ts_data *ts)
usleep_range(6000, 10000); /* T4: > 5ms */
- /*
- * Put the reset pin back in to input / high-impedance mode to save
- * power. Only do this in the non ACPI case since some ACPI boards
- * don't have a pull-up, so there the reset pin must stay active-high.
- */
- if (ts->irq_pin_access_method == IRQ_PIN_ACCESS_GPIO) {
- error = gpiod_direction_input(ts->gpiod_rst);
- if (error)
- goto error;
- }
-
return 0;
error:
@@ -957,14 +946,6 @@ static int goodix_add_acpi_gpio_mappings(struct goodix_ts_data *ts)
return -EINVAL;
}
- /*
- * Normally we put the reset pin in input / high-impedance mode to save
- * power. But some x86/ACPI boards don't have a pull-up, so for the ACPI
- * case, leave the pin as is. This results in the pin not being touched
- * at all on x86/ACPI boards, except when needed for error-recover.
- */
- ts->gpiod_rst_flags = GPIOD_ASIS;
-
return devm_acpi_dev_add_driver_gpios(dev, gpio_mapping);
}
#else
@@ -989,12 +970,6 @@ static int goodix_get_gpio_config(struct goodix_ts_data *ts)
return -EINVAL;
dev = &ts->client->dev;
- /*
- * By default we request the reset pin as input, leaving it in
- * high-impedance when not resetting the controller to save power.
- */
- ts->gpiod_rst_flags = GPIOD_IN;
-
ts->avdd28 = devm_regulator_get(dev, "AVDD28");
if (IS_ERR(ts->avdd28))
return dev_err_probe(dev, PTR_ERR(ts->avdd28), "Failed to get AVDD28 regulator\n");
@@ -1019,7 +994,7 @@ retry_get_irq_gpio:
ts->gpiod_int = gpiod;
/* Get the reset line GPIO pin number */
- gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, ts->gpiod_rst_flags);
+ gpiod = devm_gpiod_get_optional(dev, GOODIX_GPIO_RST_NAME, GPIOD_ASIS);
if (IS_ERR(gpiod))
return dev_err_probe(dev, PTR_ERR(gpiod), "Failed to get %s GPIO\n",
GOODIX_GPIO_RST_NAME);
@@ -1557,6 +1532,7 @@ MODULE_DEVICE_TABLE(i2c, goodix_ts_id);
static const struct acpi_device_id goodix_acpi_match[] = {
{ "GDIX1001", 0 },
{ "GDIX1002", 0 },
+ { "GDIX1003", 0 },
{ "GDX9110", 0 },
{ }
};
diff --git a/drivers/input/touchscreen/goodix.h b/drivers/input/touchscreen/goodix.h
index 87797cc88b32..0d1e8a8d2cba 100644
--- a/drivers/input/touchscreen/goodix.h
+++ b/drivers/input/touchscreen/goodix.h
@@ -88,7 +88,6 @@ struct goodix_ts_data {
struct gpio_desc *gpiod_rst;
int gpio_count;
int gpio_int_idx;
- enum gpiod_flags gpiod_rst_flags;
char id[GOODIX_ID_MAX_LEN + 1];
char cfg_name[64];
u16 version;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7944a3af4545..f1fb27681b0b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -2008,7 +2008,7 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev,
end - addr, iovad->granule - iova_start_pad);
if (!dev_is_dma_coherent(dev) &&
- !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_cpu(phys, len, dir);
swiotlb_tbl_unmap_single(dev, phys, len, dir, attrs);
@@ -2032,7 +2032,8 @@ static void __iommu_dma_iova_unlink(struct device *dev,
size_t unmapped;
if ((state->__size & DMA_IOVA_USE_SWIOTLB) ||
- (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)))
+ (!dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))))
iommu_dma_iova_unlink_range_slow(dev, addr, size, dir, attrs);
iommu_iotlb_gather_init(&iotlb_gather);
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 6f1010da221c..21d4a35538f6 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -161,8 +161,8 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
vevent = &veventq->lost_events_header;
goto out_set_header;
}
- memcpy(vevent->event_data, event_data, data_len);
vevent->data_len = data_len;
+ memcpy(vevent->event_data, event_data, data_len);
veventq->num_events++;
out_set_header:
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 627f9b78483a..85d0843ed07b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -614,7 +614,6 @@ struct iommufd_veventq {
struct iommufd_eventq common;
struct iommufd_viommu *viommu;
struct list_head node; /* for iommufd_viommu::veventqs */
- struct iommufd_vevent lost_events_header;
enum iommu_veventq_type type;
unsigned int depth;
@@ -622,6 +621,9 @@ struct iommufd_veventq {
/* Use common.lock for protection */
u32 num_events;
u32 sequence;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct iommufd_vevent lost_events_header;
};
static inline struct iommufd_veventq *
diff --git a/drivers/irqchip/irq-riscv-intc.c b/drivers/irqchip/irq-riscv-intc.c
index e5805885394e..70290b35b317 100644
--- a/drivers/irqchip/irq-riscv-intc.c
+++ b/drivers/irqchip/irq-riscv-intc.c
@@ -166,7 +166,8 @@ static int riscv_intc_domain_alloc(struct irq_domain *domain,
static const struct irq_domain_ops riscv_intc_domain_ops = {
.map = riscv_intc_domain_map,
.xlate = irq_domain_xlate_onecell,
- .alloc = riscv_intc_domain_alloc
+ .alloc = riscv_intc_domain_alloc,
+ .free = irq_domain_free_irqs_top,
};
static struct fwnode_handle *riscv_intc_hwnode(void)
diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c
index c9dd8c42c0cd..3a28ab5c42e5 100644
--- a/drivers/mailbox/mailbox-test.c
+++ b/drivers/mailbox/mailbox-test.c
@@ -268,7 +268,7 @@ static int mbox_test_add_debugfs(struct platform_device *pdev,
return 0;
tdev->root_debugfs_dir = debugfs_create_dir(dev_name(&pdev->dev), NULL);
- if (!tdev->root_debugfs_dir) {
+ if (IS_ERR(tdev->root_debugfs_dir)) {
dev_err(&pdev->dev, "Failed to create Mailbox debugfs\n");
return -EINVAL;
}
diff --git a/drivers/mailbox/mailbox-th1520.c b/drivers/mailbox/mailbox-th1520.c
index a6b2aa9ae952..626957c2e435 100644
--- a/drivers/mailbox/mailbox-th1520.c
+++ b/drivers/mailbox/mailbox-th1520.c
@@ -435,10 +435,8 @@ static int th1520_mbox_probe(struct platform_device *pdev)
}
ret = devm_add_action_or_reset(dev, th1520_disable_clk, priv);
- if (ret) {
- clk_bulk_disable_unprepare(ARRAY_SIZE(priv->clocks), priv->clocks);
+ if (ret)
return ret;
- }
/*
* The address mappings in the device tree align precisely with those
diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c
index 654a60f63756..5791f80f995a 100644
--- a/drivers/mailbox/mtk-cmdq-mailbox.c
+++ b/drivers/mailbox/mtk-cmdq-mailbox.c
@@ -92,6 +92,18 @@ struct gce_plat {
u32 gce_num;
};
+static inline u32 cmdq_convert_gce_addr(dma_addr_t addr, const struct gce_plat *pdata)
+{
+ /* Convert DMA addr (PA or IOVA) to GCE readable addr */
+ return addr >> pdata->shift;
+}
+
+static inline dma_addr_t cmdq_revert_gce_addr(u32 addr, const struct gce_plat *pdata)
+{
+ /* Revert GCE readable addr to DMA addr (PA or IOVA) */
+ return (dma_addr_t)addr << pdata->shift;
+}
+
u8 cmdq_get_shift_pa(struct mbox_chan *chan)
{
struct cmdq *cmdq = container_of(chan->mbox, struct cmdq, mbox);
@@ -188,13 +200,12 @@ static void cmdq_task_insert_into_thread(struct cmdq_task *task)
struct cmdq_task *prev_task = list_last_entry(
&thread->task_busy_list, typeof(*task), list_entry);
u64 *prev_task_base = prev_task->pkt->va_base;
+ u32 gce_addr = cmdq_convert_gce_addr(task->pa_base, task->cmdq->pdata);
/* let previous task jump to this task */
dma_sync_single_for_cpu(dev, prev_task->pa_base,
prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
- prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] =
- (u64)CMDQ_JUMP_BY_PA << 32 |
- (task->pa_base >> task->cmdq->pdata->shift);
+ prev_task_base[CMDQ_NUM_CMD(prev_task->pkt) - 1] = (u64)CMDQ_JUMP_BY_PA << 32 | gce_addr;
dma_sync_single_for_device(dev, prev_task->pa_base,
prev_task->pkt->cmd_buf_size, DMA_TO_DEVICE);
@@ -237,7 +248,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq,
struct cmdq_thread *thread)
{
struct cmdq_task *task, *tmp, *curr_task = NULL;
- u32 curr_pa, irq_flag, task_end_pa;
+ u32 irq_flag, gce_addr;
+ dma_addr_t curr_pa, task_end_pa;
bool err;
irq_flag = readl(thread->base + CMDQ_THR_IRQ_STATUS);
@@ -259,7 +271,8 @@ static void cmdq_thread_irq_handler(struct cmdq *cmdq,
else
return;
- curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) << cmdq->pdata->shift;
+ gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR);
+ curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
list_for_each_entry_safe(task, tmp, &thread->task_busy_list,
list_entry) {
@@ -378,7 +391,8 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data)
struct cmdq_thread *thread = (struct cmdq_thread *)chan->con_priv;
struct cmdq *cmdq = dev_get_drvdata(chan->mbox->dev);
struct cmdq_task *task;
- unsigned long curr_pa, end_pa;
+ u32 gce_addr;
+ dma_addr_t curr_pa, end_pa;
/* Client should not flush new tasks if suspended. */
WARN_ON(cmdq->suspended);
@@ -402,20 +416,20 @@ static int cmdq_mbox_send_data(struct mbox_chan *chan, void *data)
*/
WARN_ON(cmdq_thread_reset(cmdq, thread) < 0);
- writel(task->pa_base >> cmdq->pdata->shift,
- thread->base + CMDQ_THR_CURR_ADDR);
- writel((task->pa_base + pkt->cmd_buf_size) >> cmdq->pdata->shift,
- thread->base + CMDQ_THR_END_ADDR);
+ gce_addr = cmdq_convert_gce_addr(task->pa_base, cmdq->pdata);
+ writel(gce_addr, thread->base + CMDQ_THR_CURR_ADDR);
+ gce_addr = cmdq_convert_gce_addr(task->pa_base + pkt->cmd_buf_size, cmdq->pdata);
+ writel(gce_addr, thread->base + CMDQ_THR_END_ADDR);
writel(thread->priority, thread->base + CMDQ_THR_PRIORITY);
writel(CMDQ_THR_IRQ_EN, thread->base + CMDQ_THR_IRQ_ENABLE);
writel(CMDQ_THR_ENABLED, thread->base + CMDQ_THR_ENABLE_TASK);
} else {
WARN_ON(cmdq_thread_suspend(cmdq, thread) < 0);
- curr_pa = readl(thread->base + CMDQ_THR_CURR_ADDR) <<
- cmdq->pdata->shift;
- end_pa = readl(thread->base + CMDQ_THR_END_ADDR) <<
- cmdq->pdata->shift;
+ gce_addr = readl(thread->base + CMDQ_THR_CURR_ADDR);
+ curr_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
+ gce_addr = readl(thread->base + CMDQ_THR_END_ADDR);
+ end_pa = cmdq_revert_gce_addr(gce_addr, cmdq->pdata);
/* check boundary */
if (curr_pa == end_pa - CMDQ_INST_SIZE ||
curr_pa == end_pa) {
@@ -646,6 +660,9 @@ static int cmdq_probe(struct platform_device *pdev)
if (err)
return err;
+ dma_set_coherent_mask(dev,
+ DMA_BIT_MASK(sizeof(u32) * BITS_PER_BYTE + cmdq->pdata->shift));
+
cmdq->mbox.dev = dev;
cmdq->mbox.chans = devm_kcalloc(dev, cmdq->pdata->thread_nr,
sizeof(*cmdq->mbox.chans), GFP_KERNEL);
diff --git a/drivers/mailbox/mtk-gpueb-mailbox.c b/drivers/mailbox/mtk-gpueb-mailbox.c
index 925bcf21f650..f6d2beccd91b 100644
--- a/drivers/mailbox/mtk-gpueb-mailbox.c
+++ b/drivers/mailbox/mtk-gpueb-mailbox.c
@@ -200,7 +200,7 @@ static bool mtk_gpueb_mbox_last_tx_done(struct mbox_chan *chan)
return !(readl(ch->ebm->mbox_ctl + GPUEB_MBOX_CTL_TX_STS) & BIT(ch->num));
}
-const struct mbox_chan_ops mtk_gpueb_mbox_ops = {
+static const struct mbox_chan_ops mtk_gpueb_mbox_ops = {
.send_data = mtk_gpueb_mbox_send_data,
.startup = mtk_gpueb_mbox_startup,
.shutdown = mtk_gpueb_mbox_shutdown,
diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 680243751d62..17fe6545875d 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -68,6 +68,7 @@ struct omap_mbox_fifo {
struct omap_mbox_match_data {
u32 intr_type;
+ bool is_exclusive;
};
struct omap_mbox_device {
@@ -78,6 +79,7 @@ struct omap_mbox_device {
u32 num_users;
u32 num_fifos;
u32 intr_type;
+ const struct omap_mbox_match_data *mbox_data;
};
struct omap_mbox {
@@ -341,11 +343,13 @@ static int omap_mbox_suspend(struct device *dev)
if (pm_runtime_status_suspended(dev))
return 0;
- for (fifo = 0; fifo < mdev->num_fifos; fifo++) {
- if (mbox_read_reg(mdev, MAILBOX_MSGSTATUS(fifo))) {
- dev_err(mdev->dev, "fifo %d has unexpected unread messages\n",
- fifo);
- return -EBUSY;
+ if (mdev->mbox_data->is_exclusive) {
+ for (fifo = 0; fifo < mdev->num_fifos; fifo++) {
+ if (mbox_read_reg(mdev, MAILBOX_MSGSTATUS(fifo))) {
+ dev_err(mdev->dev, "fifo %d has unexpected unread messages\n",
+ fifo);
+ return -EBUSY;
+ }
}
}
@@ -378,8 +382,9 @@ static const struct dev_pm_ops omap_mbox_pm_ops = {
SET_SYSTEM_SLEEP_PM_OPS(omap_mbox_suspend, omap_mbox_resume)
};
-static const struct omap_mbox_match_data omap2_data = { MBOX_INTR_CFG_TYPE1 };
-static const struct omap_mbox_match_data omap4_data = { MBOX_INTR_CFG_TYPE2 };
+static const struct omap_mbox_match_data omap2_data = { MBOX_INTR_CFG_TYPE1, true };
+static const struct omap_mbox_match_data omap4_data = { MBOX_INTR_CFG_TYPE2, true };
+static const struct omap_mbox_match_data am654_data = { MBOX_INTR_CFG_TYPE2, false };
static const struct of_device_id omap_mailbox_of_match[] = {
{
@@ -396,11 +401,11 @@ static const struct of_device_id omap_mailbox_of_match[] = {
},
{
.compatible = "ti,am654-mailbox",
- .data = &omap4_data,
+ .data = &am654_data,
},
{
.compatible = "ti,am64-mailbox",
- .data = &omap4_data,
+ .data = &am654_data,
},
{
/* end */
@@ -449,7 +454,6 @@ static int omap_mbox_probe(struct platform_device *pdev)
struct omap_mbox_fifo *fifo;
struct device_node *node = pdev->dev.of_node;
struct device_node *child;
- const struct omap_mbox_match_data *match_data;
struct mbox_controller *controller;
u32 intr_type, info_count;
u32 num_users, num_fifos;
@@ -462,11 +466,6 @@ static int omap_mbox_probe(struct platform_device *pdev)
return -ENODEV;
}
- match_data = of_device_get_match_data(&pdev->dev);
- if (!match_data)
- return -ENODEV;
- intr_type = match_data->intr_type;
-
if (of_property_read_u32(node, "ti,mbox-num-users", &num_users))
return -ENODEV;
@@ -483,6 +482,12 @@ static int omap_mbox_probe(struct platform_device *pdev)
if (!mdev)
return -ENOMEM;
+ mdev->mbox_data = device_get_match_data(&pdev->dev);
+ if (!mdev->mbox_data)
+ return -ENODEV;
+
+ intr_type = mdev->mbox_data->intr_type;
+
mdev->mbox_base = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(mdev->mbox_base))
return PTR_ERR(mdev->mbox_base);
diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
index 0a00719b2482..ff292b9e0be9 100644
--- a/drivers/mailbox/pcc.c
+++ b/drivers/mailbox/pcc.c
@@ -276,9 +276,8 @@ static int pcc_mbox_error_check_and_clear(struct pcc_chan_info *pchan)
if (ret)
return ret;
- val &= pchan->error.status_mask;
- if (val) {
- val &= ~pchan->error.status_mask;
+ if (val & pchan->error.status_mask) {
+ val &= pchan->error.preserve_mask;
pcc_chan_reg_write(&pchan->error, val);
return -EIO;
}
@@ -745,7 +744,8 @@ static int pcc_parse_subspace_db_reg(struct pcc_chan_info *pchan,
ret = pcc_chan_reg_init(&pchan->error,
&pcct_ext->error_status_register,
- 0, 0, pcct_ext->error_status_mask,
+ ~pcct_ext->error_status_mask, 0,
+ pcct_ext->error_status_mask,
"Error Status");
}
return ret;
diff --git a/drivers/md/dm-pcache/Makefile b/drivers/md/dm-pcache/Makefile
index 86776e4acad2..cedfd38854f6 100644
--- a/drivers/md/dm-pcache/Makefile
+++ b/drivers/md/dm-pcache/Makefile
@@ -1,3 +1,3 @@
dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o
-obj-m += dm-pcache.o
+obj-$(CONFIG_DM_PCACHE) += dm-pcache.o
diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c
index d8e92367d947..698697a7a73c 100644
--- a/drivers/md/dm-pcache/cache.c
+++ b/drivers/md/dm-pcache/cache.c
@@ -181,7 +181,7 @@ static void cache_info_init_default(struct pcache_cache *cache)
{
struct pcache_cache_info *cache_info = &cache->cache_info;
- cache_info->header.seq = 0;
+ memset(cache_info, 0, sizeof(*cache_info));
cache_info->n_segs = cache->cache_dev->seg_num;
cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT);
}
@@ -411,7 +411,7 @@ void pcache_cache_stop(struct dm_pcache *pcache)
{
struct pcache_cache *cache = &pcache->cache;
- cache_flush(cache);
+ pcache_cache_flush(cache);
cancel_delayed_work_sync(&cache->gc_work);
flush_work(&cache->clean_work);
diff --git a/drivers/md/dm-pcache/cache.h b/drivers/md/dm-pcache/cache.h
index 1136d86958c8..27613b56be54 100644
--- a/drivers/md/dm-pcache/cache.h
+++ b/drivers/md/dm-pcache/cache.h
@@ -339,7 +339,7 @@ void cache_seg_put(struct pcache_cache_segment *cache_seg);
void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id);
/* cache request*/
-int cache_flush(struct pcache_cache *cache);
+int pcache_cache_flush(struct pcache_cache *cache);
void miss_read_end_work_fn(struct work_struct *work);
int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req);
diff --git a/drivers/md/dm-pcache/cache_req.c b/drivers/md/dm-pcache/cache_req.c
index 27f94c1fa968..7854a30e07b7 100644
--- a/drivers/md/dm-pcache/cache_req.c
+++ b/drivers/md/dm-pcache/cache_req.c
@@ -790,7 +790,7 @@ err:
}
/**
- * cache_flush - Flush all ksets to persist any pending cache data
+ * pcache_cache_flush - Flush all ksets to persist any pending cache data
* @cache: Pointer to the cache structure
*
* This function iterates through all ksets associated with the provided `cache`
@@ -802,7 +802,7 @@ err:
* the respective error code, preventing the flush operation from proceeding to
* subsequent ksets.
*/
-int cache_flush(struct pcache_cache *cache)
+int pcache_cache_flush(struct pcache_cache *cache)
{
struct pcache_cache_kset *kset;
int ret;
@@ -827,7 +827,7 @@ int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *p
struct bio *bio = pcache_req->bio;
if (unlikely(bio->bi_opf & REQ_PREFLUSH))
- return cache_flush(cache);
+ return pcache_cache_flush(cache);
if (bio_data_dir(bio) == READ)
return cache_read(cache, pcache_req);
diff --git a/drivers/md/dm-pcache/pcache_internal.h b/drivers/md/dm-pcache/pcache_internal.h
index d427e534727c..b7a3319d2bd3 100644
--- a/drivers/md/dm-pcache/pcache_internal.h
+++ b/drivers/md/dm-pcache/pcache_internal.h
@@ -99,7 +99,7 @@ static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_head
/* Update latest if a more recent sequence is found */
if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) {
seq_latest = meta->seq;
- latest = (void *)header + (i * meta_max_size);
+ latest = meta_addr;
}
}
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index d382a390d39a..72047b47a7a0 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -320,11 +320,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
- fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOWAIT);
- if (unlikely(!fio->bufs[n])) {
- DMERR("failed to allocate FEC buffer");
- return -ENOMEM;
- }
+ fio->bufs[n] = mempool_alloc(&v->fec->prealloc_pool, GFP_NOIO);
}
/* try to allocate the maximum number of buffers */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f5e5e59b232b..6c83ab940af7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2005,7 +2005,7 @@ static void dm_split_and_process_bio(struct mapped_device *md,
* linear target or multiple linear targets pointing to the same
* device), we can send the flush with data directly to it.
*/
- if (map->flush_bypasses_map) {
+ if (bio->bi_iter.bi_size && map->flush_bypasses_map) {
struct list_head *devices = dm_table_get_devices(map);
if (devices->next == devices->prev)
goto send_preflush_with_data;
diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c
index f66f728b1b43..2ac9ac0a740b 100644
--- a/drivers/media/mc/mc-request.c
+++ b/drivers/media/mc/mc-request.c
@@ -282,8 +282,6 @@ EXPORT_SYMBOL_GPL(media_request_get_by_fd);
int media_request_alloc(struct media_device *mdev, int *alloc_fd)
{
struct media_request *req;
- struct file *filp;
- int fd;
int ret;
/* Either both are NULL or both are non-NULL */
@@ -297,19 +295,6 @@ int media_request_alloc(struct media_device *mdev, int *alloc_fd)
if (!req)
return -ENOMEM;
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- ret = fd;
- goto err_free_req;
- }
-
- filp = anon_inode_getfile("request", &request_fops, NULL, O_CLOEXEC);
- if (IS_ERR(filp)) {
- ret = PTR_ERR(filp);
- goto err_put_fd;
- }
-
- filp->private_data = req;
req->mdev = mdev;
req->state = MEDIA_REQUEST_STATE_IDLE;
req->num_incomplete_objects = 0;
@@ -320,19 +305,24 @@ int media_request_alloc(struct media_device *mdev, int *alloc_fd)
req->updating_count = 0;
req->access_count = 0;
- *alloc_fd = fd;
+ FD_PREPARE(fdf, O_CLOEXEC,
+ anon_inode_getfile("request", &request_fops, NULL,
+ O_CLOEXEC));
+ if (fdf.err) {
+ ret = fdf.err;
+ goto err_free_req;
+ }
+
+ fd_prepare_file(fdf)->private_data = req;
+
+ *alloc_fd = fd_publish(fdf);
snprintf(req->debug_str, sizeof(req->debug_str), "%u:%d",
- atomic_inc_return(&mdev->request_id), fd);
+ atomic_inc_return(&mdev->request_id), *alloc_fd);
dev_dbg(mdev->dev, "request: allocated %s\n", req->debug_str);
- fd_install(fd, filp);
-
return 0;
-err_put_fd:
- put_unused_fd(fd);
-
err_free_req:
if (mdev->ops->req_free)
mdev->ops->req_free(req);
diff --git a/drivers/memory/tegra/tegra210.c b/drivers/memory/tegra/tegra210.c
index cfa61dd88557..3c2949c16fde 100644
--- a/drivers/memory/tegra/tegra210.c
+++ b/drivers/memory/tegra/tegra210.c
@@ -1015,7 +1015,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = {
},
},
}, {
- .id = TEGRA210_MC_SESRD,
+ .id = TEGRA210_MC_SESWR,
.name = "seswr",
.swgroup = TEGRA_SWGROUP_SE,
.regs = {
@@ -1079,7 +1079,7 @@ static const struct tegra_mc_client tegra210_mc_clients[] = {
},
},
}, {
- .id = TEGRA210_MC_ETRR,
+ .id = TEGRA210_MC_ETRW,
.name = "etrw",
.swgroup = TEGRA_SWGROUP_ETR,
.regs = {
diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c
index b017ff29dbd1..73cad914be9f 100644
--- a/drivers/misc/mei/pci-me.c
+++ b/drivers/misc/mei/pci-me.c
@@ -223,6 +223,10 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
hw->mem_addr = pcim_iomap_table(pdev)[0];
hw->read_fws = mei_me_read_fws;
+ err = mei_register(dev, &pdev->dev);
+ if (err)
+ goto end;
+
pci_enable_msi(pdev);
hw->irq = pdev->irq;
@@ -237,13 +241,9 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err) {
dev_err(&pdev->dev, "request_threaded_irq failure. irq = %d\n",
pdev->irq);
- goto end;
+ goto deregister;
}
- err = mei_register(dev, &pdev->dev);
- if (err)
- goto release_irq;
-
if (mei_start(dev)) {
dev_err(&pdev->dev, "init hw failure.\n");
err = -ENODEV;
@@ -283,11 +283,10 @@ static int mei_me_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return 0;
deregister:
- mei_deregister(dev);
-release_irq:
mei_cancel_work(dev);
mei_disable_interrupts(dev);
free_irq(pdev->irq, dev);
+ mei_deregister(dev);
end:
dev_err(&pdev->dev, "initialization failed.\n");
return err;
diff --git a/drivers/misc/mei/pci-txe.c b/drivers/misc/mei/pci-txe.c
index 06b55a891c6b..98d1bc2c7f4b 100644
--- a/drivers/misc/mei/pci-txe.c
+++ b/drivers/misc/mei/pci-txe.c
@@ -87,6 +87,10 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
hw = to_txe_hw(dev);
hw->mem_addr = pcim_iomap_table(pdev);
+ err = mei_register(dev, &pdev->dev);
+ if (err)
+ goto end;
+
pci_enable_msi(pdev);
/* clear spurious interrupts */
@@ -106,13 +110,9 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err) {
dev_err(&pdev->dev, "mei: request_threaded_irq failure. irq = %d\n",
pdev->irq);
- goto end;
+ goto deregister;
}
- err = mei_register(dev, &pdev->dev);
- if (err)
- goto release_irq;
-
if (mei_start(dev)) {
dev_err(&pdev->dev, "init hw failure.\n");
err = -ENODEV;
@@ -145,11 +145,10 @@ static int mei_txe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return 0;
deregister:
- mei_deregister(dev);
-release_irq:
mei_cancel_work(dev);
mei_disable_interrupts(dev);
free_irq(pdev->irq, dev);
+ mei_deregister(dev);
end:
dev_err(&pdev->dev, "initialization failed.\n");
return err;
diff --git a/drivers/misc/mei/platform-vsc.c b/drivers/misc/mei/platform-vsc.c
index 288e7b72e942..9787b9cee71c 100644
--- a/drivers/misc/mei/platform-vsc.c
+++ b/drivers/misc/mei/platform-vsc.c
@@ -362,28 +362,27 @@ static int mei_vsc_probe(struct platform_device *pdev)
ret = mei_register(mei_dev, dev);
if (ret)
- goto err_dereg;
+ goto err;
ret = mei_start(mei_dev);
if (ret) {
dev_err_probe(dev, ret, "init hw failed\n");
- goto err_cancel;
+ goto err;
}
pm_runtime_enable(mei_dev->parent);
return 0;
-err_dereg:
- mei_deregister(mei_dev);
-
-err_cancel:
+err:
mei_cancel_work(mei_dev);
vsc_tp_register_event_cb(tp, NULL, NULL);
mei_disable_interrupts(mei_dev);
+ mei_deregister(mei_dev);
+
return ret;
}
diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
index 999026a1ae04..9087f045e362 100644
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -721,21 +721,12 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
static int ntsync_obj_get_fd(struct ntsync_obj *obj)
{
- struct file *file;
- int fd;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0)
- return fd;
- file = anon_inode_getfile("ntsync", &ntsync_obj_fops, obj, O_RDWR);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- obj->file = file;
- fd_install(fd, file);
-
- return fd;
+ FD_PREPARE(fdf, O_CLOEXEC,
+ anon_inode_getfile("ntsync", &ntsync_obj_fops, obj, O_RDWR));
+ if (fdf.err)
+ return fdf.err;
+ obj->file = fd_prepare_file(fdf);
+ return fd_publish(fdf);
}
static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 2c963cb6724b..10d0ef58ef49 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -950,7 +950,7 @@ config MMC_USHC
config MMC_WMT
tristate "Wondermedia SD/MMC Host Controller support"
depends on ARCH_VT8500 || COMPILE_TEST
- default y
+ default ARCH_VT8500
help
This selects support for the SD/MMC Host Controller on
Wondermedia WM8505/WM8650 based SoCs.
diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c
index 82dd906bb002..681354942e97 100644
--- a/drivers/mmc/host/dw_mmc-rockchip.c
+++ b/drivers/mmc/host/dw_mmc-rockchip.c
@@ -42,7 +42,7 @@ struct dw_mci_rockchip_priv_data {
*/
static int rockchip_mmc_get_internal_phase(struct dw_mci *host, bool sample)
{
- unsigned long rate = clk_get_rate(host->ciu_clk);
+ unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV;
u32 raw_value;
u16 degrees;
u32 delay_num = 0;
@@ -85,7 +85,7 @@ static int rockchip_mmc_get_phase(struct dw_mci *host, bool sample)
static int rockchip_mmc_set_internal_phase(struct dw_mci *host, bool sample, int degrees)
{
- unsigned long rate = clk_get_rate(host->ciu_clk);
+ unsigned long rate = clk_get_rate(host->ciu_clk) / RK3288_CLKGEN_DIV;
u8 nineties, remainder;
u8 delay_num;
u32 raw_value;
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index 26d03352af63..b5ea058ed467 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -652,10 +652,9 @@ static int pxamci_probe(struct platform_device *pdev)
host->clkrt = CLKRT_OFF;
host->clk = devm_clk_get(dev, NULL);
- if (IS_ERR(host->clk)) {
- host->clk = NULL;
- return PTR_ERR(host->clk);
- }
+ if (IS_ERR(host->clk))
+ return dev_err_probe(dev, PTR_ERR(host->clk),
+ "Failed to acquire clock\n");
host->clkrate = clk_get_rate(host->clk);
@@ -703,46 +702,37 @@ static int pxamci_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, mmc);
- host->dma_chan_rx = dma_request_chan(dev, "rx");
- if (IS_ERR(host->dma_chan_rx)) {
- host->dma_chan_rx = NULL;
+ host->dma_chan_rx = devm_dma_request_chan(dev, "rx");
+ if (IS_ERR(host->dma_chan_rx))
return dev_err_probe(dev, PTR_ERR(host->dma_chan_rx),
"unable to request rx dma channel\n");
- }
- host->dma_chan_tx = dma_request_chan(dev, "tx");
- if (IS_ERR(host->dma_chan_tx)) {
- dev_err(dev, "unable to request tx dma channel\n");
- ret = PTR_ERR(host->dma_chan_tx);
- host->dma_chan_tx = NULL;
- goto out;
- }
+
+ host->dma_chan_tx = devm_dma_request_chan(dev, "tx");
+ if (IS_ERR(host->dma_chan_tx))
+ return dev_err_probe(dev, PTR_ERR(host->dma_chan_tx),
+ "unable to request tx dma channel\n");
if (host->pdata) {
host->detect_delay_ms = host->pdata->detect_delay_ms;
host->power = devm_gpiod_get_optional(dev, "power", GPIOD_OUT_LOW);
- if (IS_ERR(host->power)) {
- ret = PTR_ERR(host->power);
- dev_err(dev, "Failed requesting gpio_power\n");
- goto out;
- }
+ if (IS_ERR(host->power))
+ return dev_err_probe(dev, PTR_ERR(host->power),
+ "Failed requesting gpio_power\n");
/* FIXME: should we pass detection delay to debounce? */
ret = mmc_gpiod_request_cd(mmc, "cd", 0, false, 0);
- if (ret && ret != -ENOENT) {
- dev_err(dev, "Failed requesting gpio_cd\n");
- goto out;
- }
+ if (ret && ret != -ENOENT)
+ return dev_err_probe(dev, ret, "Failed requesting gpio_cd\n");
if (!host->pdata->gpio_card_ro_invert)
mmc->caps2 |= MMC_CAP2_RO_ACTIVE_HIGH;
ret = mmc_gpiod_request_ro(mmc, "wp", 0, 0);
- if (ret && ret != -ENOENT) {
- dev_err(dev, "Failed requesting gpio_ro\n");
- goto out;
- }
+ if (ret && ret != -ENOENT)
+ return dev_err_probe(dev, ret, "Failed requesting gpio_ro\n");
+
if (!ret)
host->use_ro_gpio = true;
@@ -759,16 +749,8 @@ static int pxamci_probe(struct platform_device *pdev)
if (ret) {
if (host->pdata && host->pdata->exit)
host->pdata->exit(dev, mmc);
- goto out;
}
- return 0;
-
-out:
- if (host->dma_chan_rx)
- dma_release_channel(host->dma_chan_rx);
- if (host->dma_chan_tx)
- dma_release_channel(host->dma_chan_tx);
return ret;
}
@@ -791,8 +773,6 @@ static void pxamci_remove(struct platform_device *pdev)
dmaengine_terminate_all(host->dma_chan_rx);
dmaengine_terminate_all(host->dma_chan_tx);
- dma_release_channel(host->dma_chan_rx);
- dma_release_channel(host->dma_chan_tx);
}
}
diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c
index eebd45389956..4e256673a098 100644
--- a/drivers/mmc/host/sdhci-of-dwcmshc.c
+++ b/drivers/mmc/host/sdhci-of-dwcmshc.c
@@ -94,7 +94,7 @@
#define DLL_TXCLK_TAPNUM_DEFAULT 0x10
#define DLL_TXCLK_TAPNUM_90_DEGREES 0xA
#define DLL_TXCLK_TAPNUM_FROM_SW BIT(24)
-#define DLL_STRBIN_TAPNUM_DEFAULT 0x8
+#define DLL_STRBIN_TAPNUM_DEFAULT 0x4
#define DLL_STRBIN_TAPNUM_FROM_SW BIT(24)
#define DLL_STRBIN_DELAY_NUM_SEL BIT(26)
#define DLL_STRBIN_DELAY_NUM_OFFSET 16
@@ -289,6 +289,19 @@ static void dwcmshc_adma_write_desc(struct sdhci_host *host, void **desc,
sdhci_adma_write_desc(host, desc, addr, len, cmd);
}
+static void dwcmshc_reset(struct sdhci_host *host, u8 mask)
+{
+ sdhci_reset(host, mask);
+
+ /* The dwcmshc does not comply with the SDHCI specification
+ * regarding the "Software Reset for CMD line should clear 'Command
+ * Complete' in the Normal Interrupt Status Register." Clear the bit
+ * here to compensate for this quirk.
+ */
+ if (mask & SDHCI_RESET_CMD)
+ sdhci_writel(host, SDHCI_INT_RESPONSE, SDHCI_INT_STATUS);
+}
+
static unsigned int dwcmshc_get_max_clock(struct sdhci_host *host)
{
struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
@@ -832,15 +845,7 @@ static void th1520_sdhci_reset(struct sdhci_host *host, u8 mask)
struct dwcmshc_priv *priv = sdhci_pltfm_priv(pltfm_host);
u16 ctrl_2;
- sdhci_reset(host, mask);
-
- /* The T-Head 1520 SoC does not comply with the SDHCI specification
- * regarding the "Software Reset for CMD line should clear 'Command
- * Complete' in the Normal Interrupt Status Register." Clear the bit
- * here to compensate for this quirk.
- */
- if (mask & SDHCI_RESET_CMD)
- sdhci_writel(host, SDHCI_INT_RESPONSE, SDHCI_INT_STATUS);
+ dwcmshc_reset(host, mask);
if (priv->flags & FLAG_IO_FIXED_1V8) {
ctrl_2 = sdhci_readw(host, SDHCI_HOST_CONTROL2);
@@ -886,7 +891,7 @@ static void cv18xx_sdhci_reset(struct sdhci_host *host, u8 mask)
struct dwcmshc_priv *priv = sdhci_pltfm_priv(pltfm_host);
u32 val, emmc_caps = MMC_CAP2_NO_SD | MMC_CAP2_NO_SDIO;
- sdhci_reset(host, mask);
+ dwcmshc_reset(host, mask);
if ((host->mmc->caps2 & emmc_caps) == emmc_caps) {
val = sdhci_readl(host, priv->vendor_specific_area1 + CV18XX_SDHCI_MSHC_CTRL);
@@ -958,7 +963,7 @@ static void cv18xx_sdhci_post_tuning(struct sdhci_host *host)
val |= SDHCI_INT_DATA_AVAIL;
sdhci_writel(host, val, SDHCI_INT_STATUS);
- sdhci_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
+ dwcmshc_reset(host, SDHCI_RESET_CMD | SDHCI_RESET_DATA);
}
static int cv18xx_sdhci_execute_tuning(struct sdhci_host *host, u32 opcode)
@@ -1100,7 +1105,7 @@ static const struct sdhci_ops sdhci_dwcmshc_ops = {
.set_bus_width = sdhci_set_bus_width,
.set_uhs_signaling = dwcmshc_set_uhs_signaling,
.get_max_clock = dwcmshc_get_max_clock,
- .reset = sdhci_reset,
+ .reset = dwcmshc_reset,
.adma_write_desc = dwcmshc_adma_write_desc,
.irq = dwcmshc_cqe_irq_handler,
};
diff --git a/drivers/most/most_usb.c b/drivers/most/most_usb.c
index 10064d7b7249..41ee169f80c5 100644
--- a/drivers/most/most_usb.c
+++ b/drivers/most/most_usb.c
@@ -1058,7 +1058,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id)
ret = most_register_interface(&mdev->iface);
if (ret)
- goto err_free_busy_urbs;
+ return ret;
mutex_lock(&mdev->io_mutex);
if (le16_to_cpu(usb_dev->descriptor.idProduct) == USB_DEV_ID_OS81118 ||
@@ -1068,8 +1068,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id)
if (!mdev->dci) {
mutex_unlock(&mdev->io_mutex);
most_deregister_interface(&mdev->iface);
- ret = -ENOMEM;
- goto err_free_busy_urbs;
+ return -ENOMEM;
}
mdev->dci->dev.init_name = "dci";
@@ -1078,18 +1077,15 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id)
mdev->dci->dev.release = release_dci;
if (device_register(&mdev->dci->dev)) {
mutex_unlock(&mdev->io_mutex);
+ put_device(&mdev->dci->dev);
most_deregister_interface(&mdev->iface);
- ret = -ENOMEM;
- goto err_free_dci;
+ return -ENOMEM;
}
mdev->dci->usb_device = mdev->usb_device;
}
mutex_unlock(&mdev->io_mutex);
return 0;
-err_free_dci:
- put_device(&mdev->dci->dev);
-err_free_busy_urbs:
- kfree(mdev->busy_urbs);
+
err_free_ep_address:
kfree(mdev->ep_address);
err_free_cap:
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 8dc4f5c493fc..335c702633ff 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -599,6 +599,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp)
uint8_t *datbuf = NULL, *oobbuf = NULL;
size_t datbuf_len, oobbuf_len;
int ret = 0;
+ u64 end;
if (copy_from_user(&req, argp, sizeof(req)))
return -EFAULT;
@@ -618,7 +619,7 @@ mtdchar_write_ioctl(struct mtd_info *mtd, struct mtd_write_req __user *argp)
req.len &= 0xffffffff;
req.ooblen &= 0xffffffff;
- if (req.start + req.len > mtd->size)
+ if (check_add_overflow(req.start, req.len, &end) || end > mtd->size)
return -EINVAL;
datbuf_len = min_t(size_t, req.len, mtd->erasesize);
@@ -698,6 +699,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp)
size_t datbuf_len, oobbuf_len;
size_t orig_len, orig_ooblen;
int ret = 0;
+ u64 end;
if (copy_from_user(&req, argp, sizeof(req)))
return -EFAULT;
@@ -724,7 +726,7 @@ mtdchar_read_ioctl(struct mtd_info *mtd, struct mtd_read_req __user *argp)
req.len &= 0xffffffff;
req.ooblen &= 0xffffffff;
- if (req.start + req.len > mtd->size) {
+ if (check_add_overflow(req.start, req.len, &end) || end > mtd->size) {
ret = -EINVAL;
goto out;
}
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 4a17271076bc..1e57c8de8578 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -63,7 +63,7 @@ config MTD_NAND_ECC_MEDIATEK
config MTD_NAND_ECC_REALTEK
tristate "Realtek RTL93xx hardware ECC engine"
- depends on HAS_IOMEM
+ depends on HAS_IOMEM && HAS_DMA
depends on MACH_REALTEK_RTL || COMPILE_TEST
select MTD_NAND_ECC
help
diff --git a/drivers/mtd/nand/ecc-realtek.c b/drivers/mtd/nand/ecc-realtek.c
index 7d718934c909..0046da37ea3e 100644
--- a/drivers/mtd/nand/ecc-realtek.c
+++ b/drivers/mtd/nand/ecc-realtek.c
@@ -380,7 +380,7 @@ static void rtl_ecc_cleanup_ctx(struct nand_device *nand)
nand_ecc_cleanup_req_tweaking(&ctx->req_ctx);
}
-static struct nand_ecc_engine_ops rtl_ecc_engine_ops = {
+static const struct nand_ecc_engine_ops rtl_ecc_engine_ops = {
.init_ctx = rtl_ecc_init_ctx,
.cleanup_ctx = rtl_ecc_cleanup_ctx,
.prepare_io_req = rtl_ecc_prepare_io_req,
@@ -418,8 +418,8 @@ static int rtl_ecc_probe(struct platform_device *pdev)
rtlc->buf = dma_alloc_noncoherent(dev, RTL_ECC_DMA_SIZE, &rtlc->buf_dma,
DMA_BIDIRECTIONAL, GFP_KERNEL);
- if (IS_ERR(rtlc->buf))
- return PTR_ERR(rtlc->buf);
+ if (!rtlc->buf)
+ return -ENOMEM;
rtlc->dev = dev;
rtlc->engine.dev = dev;
diff --git a/drivers/mtd/nand/onenand/onenand_samsung.c b/drivers/mtd/nand/onenand/onenand_samsung.c
index f37a6138e461..6d6aa709a21f 100644
--- a/drivers/mtd/nand/onenand/onenand_samsung.c
+++ b/drivers/mtd/nand/onenand/onenand_samsung.c
@@ -906,7 +906,7 @@ static int s3c_onenand_probe(struct platform_device *pdev)
err = devm_request_irq(&pdev->dev, r->start,
s5pc110_onenand_irq,
IRQF_SHARED, "onenand",
- &onenand);
+ onenand);
if (err) {
dev_err(&pdev->dev, "failed to get irq\n");
return err;
diff --git a/drivers/mtd/nand/raw/cadence-nand-controller.c b/drivers/mtd/nand/raw/cadence-nand-controller.c
index 6667eea95597..32ed38b89394 100644
--- a/drivers/mtd/nand/raw/cadence-nand-controller.c
+++ b/drivers/mtd/nand/raw/cadence-nand-controller.c
@@ -2871,7 +2871,7 @@ cadence_nand_irq_cleanup(int irqnum, struct cdns_nand_ctrl *cdns_ctrl)
static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl)
{
dma_cap_mask_t mask;
- struct dma_device *dma_dev = cdns_ctrl->dmac->device;
+ struct dma_device *dma_dev;
int ret;
cdns_ctrl->cdma_desc = dma_alloc_coherent(cdns_ctrl->dev,
@@ -2915,6 +2915,7 @@ static int cadence_nand_init(struct cdns_nand_ctrl *cdns_ctrl)
}
}
+ dma_dev = cdns_ctrl->dmac->device;
cdns_ctrl->io.iova_dma = dma_map_resource(dma_dev->dev, cdns_ctrl->io.dma,
cdns_ctrl->io.size,
DMA_BIDIRECTIONAL, 0);
diff --git a/drivers/mtd/nand/spi/fmsh.c b/drivers/mtd/nand/spi/fmsh.c
index 8b2097bfc771..c2b9a8c113cb 100644
--- a/drivers/mtd/nand/spi/fmsh.c
+++ b/drivers/mtd/nand/spi/fmsh.c
@@ -58,7 +58,7 @@ static const struct spinand_info fmsh_spinand_table[] = {
SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
&write_cache_variants,
&update_cache_variants),
- SPINAND_HAS_QE_BIT,
+ 0,
SPINAND_ECCINFO(&fm25s01a_ooblayout, NULL)),
};
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index e95e593cd12d..5abef8a3b775 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2120,7 +2120,7 @@ skip_mac_set:
/* check for initial state */
new_slave->link = BOND_LINK_NOCHANGE;
if (bond->params.miimon) {
- if (netif_carrier_ok(slave_dev)) {
+ if (netif_running(slave_dev) && netif_carrier_ok(slave_dev)) {
if (bond->params.updelay) {
bond_set_slave_link_state(new_slave,
BOND_LINK_BACK,
@@ -2665,7 +2665,8 @@ static int bond_miimon_inspect(struct bonding *bond)
bond_for_each_slave_rcu(bond, slave, iter) {
bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
- link_state = netif_carrier_ok(slave->dev);
+ link_state = netif_running(slave->dev) &&
+ netif_carrier_ok(slave->dev);
switch (slave->link) {
case BOND_LINK_UP:
diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c
index 45d36adb51b7..4c0d7d26df9f 100644
--- a/drivers/net/can/rcar/rcar_canfd.c
+++ b/drivers/net/can/rcar/rcar_canfd.c
@@ -709,6 +709,11 @@ static void rcar_canfd_set_bit_reg(void __iomem *addr, u32 val)
rcar_canfd_update(val, val, addr);
}
+static void rcar_canfd_clear_bit_reg(void __iomem *addr, u32 val)
+{
+ rcar_canfd_update(val, 0, addr);
+}
+
static void rcar_canfd_update_bit_reg(void __iomem *addr, u32 mask, u32 val)
{
rcar_canfd_update(mask, val, addr);
@@ -755,25 +760,6 @@ static void rcar_canfd_set_rnc(struct rcar_canfd_global *gpriv, unsigned int ch,
rcar_canfd_set_bit(gpriv->base, RCANFD_GAFLCFG(w), rnc);
}
-static void rcar_canfd_set_mode(struct rcar_canfd_global *gpriv)
-{
- if (gpriv->info->ch_interface_mode) {
- u32 ch, val = gpriv->fdmode ? RCANFD_GEN4_FDCFG_FDOE
- : RCANFD_GEN4_FDCFG_CLOE;
-
- for_each_set_bit(ch, &gpriv->channels_mask,
- gpriv->info->max_channels)
- rcar_canfd_set_bit_reg(&gpriv->fcbase[ch].cfdcfg, val);
- } else {
- if (gpriv->fdmode)
- rcar_canfd_set_bit(gpriv->base, RCANFD_GRMCFG,
- RCANFD_GRMCFG_RCMC);
- else
- rcar_canfd_clear_bit(gpriv->base, RCANFD_GRMCFG,
- RCANFD_GRMCFG_RCMC);
- }
-}
-
static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
{
struct device *dev = &gpriv->pdev->dev;
@@ -806,6 +792,16 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
/* Reset Global error flags */
rcar_canfd_write(gpriv->base, RCANFD_GERFL, 0x0);
+ /* Set the controller into appropriate mode */
+ if (!gpriv->info->ch_interface_mode) {
+ if (gpriv->fdmode)
+ rcar_canfd_set_bit(gpriv->base, RCANFD_GRMCFG,
+ RCANFD_GRMCFG_RCMC);
+ else
+ rcar_canfd_clear_bit(gpriv->base, RCANFD_GRMCFG,
+ RCANFD_GRMCFG_RCMC);
+ }
+
/* Transition all Channels to reset mode */
for_each_set_bit(ch, &gpriv->channels_mask, gpriv->info->max_channels) {
rcar_canfd_clear_bit(gpriv->base,
@@ -823,10 +819,23 @@ static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv)
dev_dbg(dev, "channel %u reset failed\n", ch);
return err;
}
- }
- /* Set the controller into appropriate mode */
- rcar_canfd_set_mode(gpriv);
+ /* Set the controller into appropriate mode */
+ if (gpriv->info->ch_interface_mode) {
+ /* Do not set CLOE and FDOE simultaneously */
+ if (!gpriv->fdmode) {
+ rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg,
+ RCANFD_GEN4_FDCFG_FDOE);
+ rcar_canfd_set_bit_reg(&gpriv->fcbase[ch].cfdcfg,
+ RCANFD_GEN4_FDCFG_CLOE);
+ } else {
+ rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg,
+ RCANFD_GEN4_FDCFG_FDOE);
+ rcar_canfd_clear_bit_reg(&gpriv->fcbase[ch].cfdcfg,
+ RCANFD_GEN4_FDCFG_CLOE);
+ }
+ }
+ }
return 0;
}
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 4d245857ef1c..83476af8adb5 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -548,8 +548,8 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id)
if (priv->read_reg(priv, SJA1000_IER) == IRQ_OFF)
goto out;
- while ((isrc = priv->read_reg(priv, SJA1000_IR)) &&
- (n < SJA1000_MAX_IRQ)) {
+ while ((n < SJA1000_MAX_IRQ) &&
+ (isrc = priv->read_reg(priv, SJA1000_IR))) {
status = priv->read_reg(priv, SJA1000_SR);
/* check for absent controller due to hw unplug */
diff --git a/drivers/net/can/sun4i_can.c b/drivers/net/can/sun4i_can.c
index 53bfd873de9b..0a7ba0942839 100644
--- a/drivers/net/can/sun4i_can.c
+++ b/drivers/net/can/sun4i_can.c
@@ -657,8 +657,8 @@ static irqreturn_t sun4i_can_interrupt(int irq, void *dev_id)
u8 isrc, status;
int n = 0;
- while ((isrc = readl(priv->base + SUN4I_REG_INT_ADDR)) &&
- (n < SUN4I_CAN_MAX_IRQ)) {
+ while ((n < SUN4I_CAN_MAX_IRQ) &&
+ (isrc = readl(priv->base + SUN4I_REG_INT_ADDR))) {
n++;
status = readl(priv->base + SUN4I_REG_STA_ADDR);
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index 69b8d6da651b..8d8a610f9144 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -261,14 +261,21 @@ struct canfd_quirk {
u8 quirk;
} __packed;
+/* struct gs_host_frame::echo_id == GS_HOST_FRAME_ECHO_ID_RX indicates
+ * a regular RX'ed CAN frame
+ */
+#define GS_HOST_FRAME_ECHO_ID_RX 0xffffffff
+
struct gs_host_frame {
- u32 echo_id;
- __le32 can_id;
+ struct_group(header,
+ u32 echo_id;
+ __le32 can_id;
- u8 can_dlc;
- u8 channel;
- u8 flags;
- u8 reserved;
+ u8 can_dlc;
+ u8 channel;
+ u8 flags;
+ u8 reserved;
+ );
union {
DECLARE_FLEX_ARRAY(struct classic_can, classic_can);
@@ -568,6 +575,37 @@ gs_usb_get_echo_skb(struct gs_can *dev, struct sk_buff *skb,
return len;
}
+static unsigned int
+gs_usb_get_minimum_rx_length(const struct gs_can *dev, const struct gs_host_frame *hf,
+ unsigned int *data_length_p)
+{
+ unsigned int minimum_length, data_length = 0;
+
+ if (hf->flags & GS_CAN_FLAG_FD) {
+ if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX)
+ data_length = can_fd_dlc2len(hf->can_dlc);
+
+ if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
+ /* timestamp follows data field of max size */
+ minimum_length = struct_size(hf, canfd_ts, 1);
+ else
+ minimum_length = sizeof(hf->header) + data_length;
+ } else {
+ if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX &&
+ !(hf->can_id & cpu_to_le32(CAN_RTR_FLAG)))
+ data_length = can_cc_dlc2len(hf->can_dlc);
+
+ if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP)
+ /* timestamp follows data field of max size */
+ minimum_length = struct_size(hf, classic_can_ts, 1);
+ else
+ minimum_length = sizeof(hf->header) + data_length;
+ }
+
+ *data_length_p = data_length;
+ return minimum_length;
+}
+
static void gs_usb_receive_bulk_callback(struct urb *urb)
{
struct gs_usb *parent = urb->context;
@@ -576,6 +614,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
int rc;
struct net_device_stats *stats;
struct gs_host_frame *hf = urb->transfer_buffer;
+ unsigned int minimum_length, data_length;
struct gs_tx_context *txc;
struct can_frame *cf;
struct canfd_frame *cfd;
@@ -594,6 +633,15 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
return;
}
+ minimum_length = sizeof(hf->header);
+ if (urb->actual_length < minimum_length) {
+ dev_err_ratelimited(&parent->udev->dev,
+ "short read (actual_length=%u, minimum_length=%u)\n",
+ urb->actual_length, minimum_length);
+
+ goto resubmit_urb;
+ }
+
/* device reports out of range channel id */
if (hf->channel >= parent->channel_cnt)
goto device_detach;
@@ -609,20 +657,33 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
if (!netif_running(netdev))
goto resubmit_urb;
- if (hf->echo_id == -1) { /* normal rx */
+ minimum_length = gs_usb_get_minimum_rx_length(dev, hf, &data_length);
+ if (urb->actual_length < minimum_length) {
+ stats->rx_errors++;
+ stats->rx_length_errors++;
+
+ if (net_ratelimit())
+ netdev_err(netdev,
+ "short read (actual_length=%u, minimum_length=%u)\n",
+ urb->actual_length, minimum_length);
+
+ goto resubmit_urb;
+ }
+
+ if (hf->echo_id == GS_HOST_FRAME_ECHO_ID_RX) { /* normal rx */
if (hf->flags & GS_CAN_FLAG_FD) {
skb = alloc_canfd_skb(netdev, &cfd);
if (!skb)
return;
cfd->can_id = le32_to_cpu(hf->can_id);
- cfd->len = can_fd_dlc2len(hf->can_dlc);
+ cfd->len = data_length;
if (hf->flags & GS_CAN_FLAG_BRS)
cfd->flags |= CANFD_BRS;
if (hf->flags & GS_CAN_FLAG_ESI)
cfd->flags |= CANFD_ESI;
- memcpy(cfd->data, hf->canfd->data, cfd->len);
+ memcpy(cfd->data, hf->canfd->data, data_length);
} else {
skb = alloc_can_skb(netdev, &cf);
if (!skb)
@@ -631,7 +692,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
cf->can_id = le32_to_cpu(hf->can_id);
can_frame_set_cc_len(cf, hf->can_dlc, dev->can.ctrlmode);
- memcpy(cf->data, hf->classic_can->data, 8);
+ memcpy(cf->data, hf->classic_can->data, data_length);
/* ERROR frames tell us information about the controller */
if (le32_to_cpu(hf->can_id) & CAN_ERR_FLAG)
@@ -687,7 +748,7 @@ static void gs_usb_receive_bulk_callback(struct urb *urb)
resubmit_urb:
usb_fill_bulk_urb(urb, parent->udev,
parent->pipe_in,
- hf, dev->parent->hf_size_rx,
+ hf, parent->hf_size_rx,
gs_usb_receive_bulk_callback, parent);
rc = usb_submit_urb(urb, GFP_ATOMIC);
@@ -750,8 +811,21 @@ static void gs_usb_xmit_callback(struct urb *urb)
struct gs_can *dev = txc->dev;
struct net_device *netdev = dev->netdev;
- if (urb->status)
- netdev_info(netdev, "usb xmit fail %u\n", txc->echo_id);
+ if (!urb->status)
+ return;
+
+ if (urb->status != -ESHUTDOWN && net_ratelimit())
+ netdev_info(netdev, "failed to xmit URB %u: %pe\n",
+ txc->echo_id, ERR_PTR(urb->status));
+
+ netdev->stats.tx_dropped++;
+ netdev->stats.tx_errors++;
+
+ can_free_echo_skb(netdev, txc->echo_id, NULL);
+ gs_free_tx_context(txc);
+ atomic_dec(&dev->active_tx_urbs);
+
+ netif_wake_queue(netdev);
}
static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb,
diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
index c29828a94ad0..1167d38344f1 100644
--- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
+++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_leaf.c
@@ -685,7 +685,7 @@ static int kvaser_usb_leaf_wait_cmd(const struct kvaser_usb *dev, u8 id,
* for further details.
*/
if (tmp->len == 0) {
- pos = round_up(pos,
+ pos = round_up(pos + 1,
le16_to_cpu
(dev->bulk_in->wMaxPacketSize));
continue;
@@ -1732,7 +1732,7 @@ static void kvaser_usb_leaf_read_bulk_callback(struct kvaser_usb *dev,
* number of events in case of a heavy rx load on the bus.
*/
if (cmd->len == 0) {
- pos = round_up(pos, le16_to_cpu
+ pos = round_up(pos + 1, le16_to_cpu
(dev->bulk_in->wMaxPacketSize));
continue;
}
diff --git a/drivers/net/dsa/hirschmann/hellcreek_ptp.c b/drivers/net/dsa/hirschmann/hellcreek_ptp.c
index bfe21f9f7dcd..cb23bea9c21b 100644
--- a/drivers/net/dsa/hirschmann/hellcreek_ptp.c
+++ b/drivers/net/dsa/hirschmann/hellcreek_ptp.c
@@ -376,8 +376,18 @@ static int hellcreek_led_setup(struct hellcreek *hellcreek)
hellcreek_set_brightness(hellcreek, STATUS_OUT_IS_GM, 1);
/* Register both leds */
- led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good);
- led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm);
+ ret = led_classdev_register(hellcreek->dev, &hellcreek->led_sync_good);
+ if (ret) {
+ dev_err(hellcreek->dev, "Failed to register sync_good LED\n");
+ goto out;
+ }
+
+ ret = led_classdev_register(hellcreek->dev, &hellcreek->led_is_gm);
+ if (ret) {
+ dev_err(hellcreek->dev, "Failed to register is_gm LED\n");
+ led_classdev_unregister(&hellcreek->led_sync_good);
+ goto out;
+ }
ret = 0;
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 933ae8dc6337..0c10351fe5eb 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -2587,8 +2587,8 @@ static int ksz_irq_phy_setup(struct ksz_device *dev)
irq = irq_find_mapping(dev->ports[port].pirq.domain,
PORT_SRC_PHY_INT);
- if (irq < 0) {
- ret = irq;
+ if (!irq) {
+ ret = -EINVAL;
goto out;
}
ds->user_mii_bus->irq[phy] = irq;
@@ -2952,8 +2952,8 @@ static int ksz_pirq_setup(struct ksz_device *dev, u8 p)
snprintf(pirq->name, sizeof(pirq->name), "port_irq-%d", p);
pirq->irq_num = irq_find_mapping(dev->girq.domain, p);
- if (pirq->irq_num < 0)
- return pirq->irq_num;
+ if (!pirq->irq_num)
+ return -EINVAL;
return ksz_irq_common_setup(dev, pirq);
}
@@ -3038,12 +3038,12 @@ static int ksz_setup(struct dsa_switch *ds)
dsa_switch_for_each_user_port(dp, dev->ds) {
ret = ksz_pirq_setup(dev, dp->index);
if (ret)
- goto out_girq;
+ goto port_release;
if (dev->info->ptp_capable) {
ret = ksz_ptp_irq_setup(ds, dp->index);
if (ret)
- goto out_pirq;
+ goto pirq_release;
}
}
}
@@ -3053,7 +3053,7 @@ static int ksz_setup(struct dsa_switch *ds)
if (ret) {
dev_err(dev->dev, "Failed to register PTP clock: %d\n",
ret);
- goto out_ptpirq;
+ goto port_release;
}
}
@@ -3076,17 +3076,16 @@ static int ksz_setup(struct dsa_switch *ds)
out_ptp_clock_unregister:
if (dev->info->ptp_capable)
ksz_ptp_clock_unregister(ds);
-out_ptpirq:
- if (dev->irq > 0 && dev->info->ptp_capable)
- dsa_switch_for_each_user_port(dp, dev->ds)
- ksz_ptp_irq_free(ds, dp->index);
-out_pirq:
- if (dev->irq > 0)
- dsa_switch_for_each_user_port(dp, dev->ds)
+port_release:
+ if (dev->irq > 0) {
+ dsa_switch_for_each_user_port_continue_reverse(dp, dev->ds) {
+ if (dev->info->ptp_capable)
+ ksz_ptp_irq_free(ds, dp->index);
+pirq_release:
ksz_irq_free(&dev->ports[dp->index].pirq);
-out_girq:
- if (dev->irq > 0)
+ }
ksz_irq_free(&dev->girq);
+ }
return ret;
}
diff --git a/drivers/net/dsa/microchip/ksz_ptp.c b/drivers/net/dsa/microchip/ksz_ptp.c
index 35fc21b1ee48..997e4a76d0a6 100644
--- a/drivers/net/dsa/microchip/ksz_ptp.c
+++ b/drivers/net/dsa/microchip/ksz_ptp.c
@@ -1093,19 +1093,19 @@ static int ksz_ptp_msg_irq_setup(struct ksz_port *port, u8 n)
static const char * const name[] = {"pdresp-msg", "xdreq-msg",
"sync-msg"};
const struct ksz_dev_ops *ops = port->ksz_dev->dev_ops;
+ struct ksz_irq *ptpirq = &port->ptpirq;
struct ksz_ptp_irq *ptpmsg_irq;
ptpmsg_irq = &port->ptpmsg_irq[n];
+ ptpmsg_irq->num = irq_create_mapping(ptpirq->domain, n);
+ if (!ptpmsg_irq->num)
+ return -EINVAL;
ptpmsg_irq->port = port;
ptpmsg_irq->ts_reg = ops->get_port_addr(port->num, ts_reg[n]);
strscpy(ptpmsg_irq->name, name[n]);
- ptpmsg_irq->num = irq_find_mapping(port->ptpirq.domain, n);
- if (ptpmsg_irq->num < 0)
- return ptpmsg_irq->num;
-
return request_threaded_irq(ptpmsg_irq->num, NULL,
ksz_ptp_msg_thread_fn, IRQF_ONESHOT,
ptpmsg_irq->name, ptpmsg_irq);
@@ -1135,12 +1135,9 @@ int ksz_ptp_irq_setup(struct dsa_switch *ds, u8 p)
if (!ptpirq->domain)
return -ENOMEM;
- for (irq = 0; irq < ptpirq->nirqs; irq++)
- irq_create_mapping(ptpirq->domain, irq);
-
ptpirq->irq_num = irq_find_mapping(port->pirq.domain, PORT_SRC_PTP_INT);
- if (ptpirq->irq_num < 0) {
- ret = ptpirq->irq_num;
+ if (!ptpirq->irq_num) {
+ ret = -EINVAL;
goto out;
}
@@ -1159,12 +1156,11 @@ int ksz_ptp_irq_setup(struct dsa_switch *ds, u8 p)
out_ptp_msg:
free_irq(ptpirq->irq_num, ptpirq);
- while (irq--)
+ while (irq--) {
free_irq(port->ptpmsg_irq[irq].num, &port->ptpmsg_irq[irq]);
-out:
- for (irq = 0; irq < ptpirq->nirqs; irq++)
irq_dispose_mapping(port->ptpmsg_irq[irq].num);
-
+ }
+out:
irq_domain_remove(ptpirq->domain);
return ret;
diff --git a/drivers/net/dsa/microchip/lan937x_main.c b/drivers/net/dsa/microchip/lan937x_main.c
index b1ae3b9de3d1..5a1496fff445 100644
--- a/drivers/net/dsa/microchip/lan937x_main.c
+++ b/drivers/net/dsa/microchip/lan937x_main.c
@@ -540,6 +540,7 @@ static void lan937x_set_tune_adj(struct ksz_device *dev, int port,
ksz_pread16(dev, port, reg, &data16);
/* Update tune Adjust */
+ data16 &= ~PORT_TUNE_ADJ;
data16 |= FIELD_PREP(PORT_TUNE_ADJ, val);
ksz_pwrite16(dev, port, reg, data16);
diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c
index f674c400f05b..aa2145cf29a6 100644
--- a/drivers/net/dsa/sja1105/sja1105_main.c
+++ b/drivers/net/dsa/sja1105/sja1105_main.c
@@ -1302,14 +1302,7 @@ static int sja1105_set_port_speed(struct sja1105_private *priv, int port,
* table, since this will be used for the clocking setup, and we no
* longer need to store it in the static config (already told hardware
* we want auto during upload phase).
- * Actually for the SGMII port, the MAC is fixed at 1 Gbps and
- * we need to configure the PCS only (if even that).
*/
- if (priv->phy_mode[port] == PHY_INTERFACE_MODE_SGMII)
- speed = priv->info->port_speed[SJA1105_SPEED_1000MBPS];
- else if (priv->phy_mode[port] == PHY_INTERFACE_MODE_2500BASEX)
- speed = priv->info->port_speed[SJA1105_SPEED_2500MBPS];
-
mac[port].speed = speed;
return 0;
diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c
index 691361b25407..c0e17035db18 100644
--- a/drivers/net/ethernet/airoha/airoha_ppe.c
+++ b/drivers/net/ethernet/airoha/airoha_ppe.c
@@ -282,7 +282,7 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth,
if (!airoha_is_valid_gdm_port(eth, port))
return -EINVAL;
- if (dsa_port >= 0)
+ if (dsa_port >= 0 || eth->ports[1])
pse_port = port->id == 4 ? FE_PSE_PORT_GDM4
: port->id;
else
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c
index 1921741f7311..18b08277d2e1 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.c
@@ -15,6 +15,7 @@
#include "aq_hw.h"
#include "aq_nic.h"
+#include "hw_atl/hw_atl_llh.h"
void aq_hw_write_reg_bit(struct aq_hw_s *aq_hw, u32 addr, u32 msk,
u32 shift, u32 val)
@@ -81,6 +82,27 @@ void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value)
lo_hi_writeq(value, hw->mmio + reg);
}
+int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw)
+{
+	u32 done;
+	int ret;
+
+	/* Toggle the RX DMA descriptor cache init bit so the hardware stops
+	 * writing to cached descriptors and to the buffers they point at.
+	 */
+	hw_atl_rdm_rx_dma_desc_cache_init_tgl(hw);
+
+	ret = aq_hw_err_from_flags(hw);
+	if (ret)
+		return ret;
+
+	/* Poll for the init-done flag; the poll result is discarded. */
+	readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get,
+				  hw, done, done == 1, 1000U, 10000U);
+
+	return 0;
+}
+
int aq_hw_err_from_flags(struct aq_hw_s *hw)
{
int err = 0;
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h
index ffa6e4067c21..d89c63d88e4a 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw_utils.h
@@ -35,6 +35,7 @@ u32 aq_hw_read_reg(struct aq_hw_s *hw, u32 reg);
void aq_hw_write_reg(struct aq_hw_s *hw, u32 reg, u32 value);
u64 aq_hw_read_reg64(struct aq_hw_s *hw, u32 reg);
void aq_hw_write_reg64(struct aq_hw_s *hw, u32 reg, u64 value);
+int aq_hw_invalidate_descriptor_cache(struct aq_hw_s *hw);
int aq_hw_err_from_flags(struct aq_hw_s *hw);
int aq_hw_num_tcs(struct aq_hw_s *hw);
int aq_hw_q_per_tc(struct aq_hw_s *hw);
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index f21de0c21e52..d23d23bed39f 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -547,6 +547,11 @@ static int __aq_ring_rx_clean(struct aq_ring_s *self, struct napi_struct *napi,
if (!buff->is_eop) {
unsigned int frag_cnt = 0U;
+
+ /* There will be an extra fragment */
+ if (buff->len > AQ_CFG_RX_HDR_SIZE)
+ frag_cnt++;
+
buff_ = buff;
do {
bool is_rsc_completed = true;
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
index 493432d036b9..c7895bfb2ecf 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c
@@ -1198,26 +1198,9 @@ static int hw_atl_b0_hw_interrupt_moderation_set(struct aq_hw_s *self)
static int hw_atl_b0_hw_stop(struct aq_hw_s *self)
{
- int err;
- u32 val;
-
hw_atl_b0_hw_irq_disable(self, HW_ATL_B0_INT_MASK);
- /* Invalidate Descriptor Cache to prevent writing to the cached
- * descriptors and to the data pointer of those descriptors
- */
- hw_atl_rdm_rx_dma_desc_cache_init_tgl(self);
-
- err = aq_hw_err_from_flags(self);
-
- if (err)
- goto err_exit;
-
- readx_poll_timeout_atomic(hw_atl_rdm_rx_dma_desc_cache_init_done_get,
- self, val, val == 1, 1000U, 10000U);
-
-err_exit:
- return err;
+ return aq_hw_invalidate_descriptor_cache(self);
}
int hw_atl_b0_hw_ring_tx_stop(struct aq_hw_s *self, struct aq_ring_s *ring)
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c
index b0ed572e88c6..0ce9caae8799 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c
@@ -759,7 +759,7 @@ static int hw_atl2_hw_stop(struct aq_hw_s *self)
{
hw_atl_b0_hw_irq_disable(self, HW_ATL2_INT_MASK);
- return 0;
+ return aq_hw_invalidate_descriptor_cache(self);
}
static struct aq_stats_s *hw_atl2_utils_get_hw_stats(struct aq_hw_s *self)
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index cb004fd16252..5bb31c8fab39 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1296,7 +1296,8 @@ static void be_xmit_flush(struct be_adapter *adapter, struct be_tx_obj *txo)
(adapter->bmc_filt_mask & BMC_FILT_MULTICAST)
static bool be_send_pkt_to_bmc(struct be_adapter *adapter,
- struct sk_buff **skb)
+ struct sk_buff **skb,
+ struct be_wrb_params *wrb_params)
{
struct ethhdr *eh = (struct ethhdr *)(*skb)->data;
bool os2bmc = false;
@@ -1360,7 +1361,7 @@ done:
* to BMC, asic expects the vlan to be inline in the packet.
*/
if (os2bmc)
- *skb = be_insert_vlan_in_pkt(adapter, *skb, NULL);
+ *skb = be_insert_vlan_in_pkt(adapter, *skb, wrb_params);
return os2bmc;
}
@@ -1387,7 +1388,7 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev)
/* if os2bmc is enabled and if the pkt is destined to bmc,
* enqueue the pkt a 2nd time with mgmt bit set.
*/
- if (be_send_pkt_to_bmc(adapter, &skb)) {
+ if (be_send_pkt_to_bmc(adapter, &skb, &wrb_params)) {
BE_WRB_F_SET(wrb_params.features, OS2BMC, 1);
wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params);
if (unlikely(!wrb_cnt))
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index 41e0d85d15da..abf1ef8e76c6 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -687,6 +687,7 @@ struct fec_enet_private {
unsigned int reload_period;
int pps_enable;
unsigned int next_counter;
+ bool perout_enable;
struct hrtimer perout_timer;
u64 perout_stime;
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 1edcfaee6819..3222359ac15b 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1835,6 +1835,8 @@ fec_enet_rx_queue(struct net_device *ndev, u16 queue_id, int budget)
ndev->stats.rx_packets++;
pkt_len = fec16_to_cpu(bdp->cbd_datlen);
ndev->stats.rx_bytes += pkt_len;
+ if (fep->quirks & FEC_QUIRK_HAS_RACC)
+ ndev->stats.rx_bytes -= 2;
index = fec_enet_get_bd_index(bdp, &rxq->bd);
page = rxq->rx_skb_info[index].page;
diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c
index fa88b47d526c..4b7bad9a485d 100644
--- a/drivers/net/ethernet/freescale/fec_ptp.c
+++ b/drivers/net/ethernet/freescale/fec_ptp.c
@@ -128,6 +128,12 @@ static int fec_ptp_enable_pps(struct fec_enet_private *fep, uint enable)
spin_lock_irqsave(&fep->tmreg_lock, flags);
+ if (fep->perout_enable) {
+ spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+ dev_err(&fep->pdev->dev, "PEROUT is running\n");
+ return -EBUSY;
+ }
+
if (fep->pps_enable == enable) {
spin_unlock_irqrestore(&fep->tmreg_lock, flags);
return 0;
@@ -243,6 +249,7 @@ static int fec_ptp_pps_perout(struct fec_enet_private *fep)
* the FEC_TCCR register in time and missed the start time.
*/
if (fep->perout_stime < curr_time + 100 * NSEC_PER_MSEC) {
+ fep->perout_enable = false;
dev_err(&fep->pdev->dev, "Current time is too close to the start time!\n");
spin_unlock_irqrestore(&fep->tmreg_lock, flags);
return -1;
@@ -497,7 +504,10 @@ static int fec_ptp_pps_disable(struct fec_enet_private *fep, uint channel)
{
unsigned long flags;
+ hrtimer_cancel(&fep->perout_timer);
+
spin_lock_irqsave(&fep->tmreg_lock, flags);
+ fep->perout_enable = false;
writel(0, fep->hwp + FEC_TCSR(channel));
spin_unlock_irqrestore(&fep->tmreg_lock, flags);
@@ -529,6 +539,8 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp,
return ret;
} else if (rq->type == PTP_CLK_REQ_PEROUT) {
+ u32 reload_period;
+
/* Reject requests with unsupported flags */
if (rq->perout.flags)
return -EOPNOTSUPP;
@@ -548,12 +560,14 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp,
return -EOPNOTSUPP;
}
- fep->reload_period = div_u64(period_ns, 2);
- if (on && fep->reload_period) {
+ reload_period = div_u64(period_ns, 2);
+ if (on && reload_period) {
+ u64 perout_stime;
+
/* Convert 1588 timestamp to ns*/
start_time.tv_sec = rq->perout.start.sec;
start_time.tv_nsec = rq->perout.start.nsec;
- fep->perout_stime = timespec64_to_ns(&start_time);
+ perout_stime = timespec64_to_ns(&start_time);
mutex_lock(&fep->ptp_clk_mutex);
if (!fep->ptp_clk_on) {
@@ -562,18 +576,41 @@ static int fec_ptp_enable(struct ptp_clock_info *ptp,
return -EOPNOTSUPP;
}
spin_lock_irqsave(&fep->tmreg_lock, flags);
+
+ if (fep->pps_enable) {
+ dev_err(&fep->pdev->dev, "PPS is running\n");
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (fep->perout_enable) {
+ dev_err(&fep->pdev->dev,
+ "PEROUT has been enabled\n");
+ ret = -EBUSY;
+ goto unlock;
+ }
+
/* Read current timestamp */
curr_time = timecounter_read(&fep->tc);
- spin_unlock_irqrestore(&fep->tmreg_lock, flags);
- mutex_unlock(&fep->ptp_clk_mutex);
+ if (perout_stime <= curr_time) {
+ dev_err(&fep->pdev->dev,
+ "Start time must be greater than current time\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
/* Calculate time difference */
- delta = fep->perout_stime - curr_time;
+ delta = perout_stime - curr_time;
+ fep->reload_period = reload_period;
+ fep->perout_stime = perout_stime;
+ fep->perout_enable = true;
- if (fep->perout_stime <= curr_time) {
- dev_err(&fep->pdev->dev, "Start time must larger than current time!\n");
- return -EINVAL;
- }
+unlock:
+ spin_unlock_irqrestore(&fep->tmreg_lock, flags);
+ mutex_unlock(&fep->ptp_clk_mutex);
+
+ if (ret)
+ return ret;
/* Because the timer counter of FEC only has 31-bits, correspondingly,
* the time comparison register FEC_TCCR also only low 31 bits can be
@@ -681,8 +718,11 @@ static irqreturn_t fec_pps_interrupt(int irq, void *dev_id)
fep->next_counter = (fep->next_counter + fep->reload_period) &
fep->cc.mask;
- event.type = PTP_CLOCK_PPS;
- ptp_clock_event(fep->ptp_clock, &event);
+ if (fep->pps_enable) {
+ event.type = PTP_CLOCK_PPS;
+ ptp_clock_event(fep->ptp_clock, &event);
+ }
+
return IRQ_HANDLED;
}
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c
index fb0f6365a6d6..8ec0f7d0fceb 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp.c
@@ -3246,7 +3246,7 @@ void ice_ptp_init(struct ice_pf *pf)
err = ice_ptp_init_port(pf, &ptp->port);
if (err)
- goto err_exit;
+ goto err_clean_pf;
/* Start the PHY timestamping block */
ice_ptp_reset_phy_timestamping(pf);
@@ -3263,13 +3263,19 @@ void ice_ptp_init(struct ice_pf *pf)
dev_info(ice_pf_to_dev(pf), "PTP init successful\n");
return;
+err_clean_pf:
+ mutex_destroy(&ptp->port.ps_lock);
+ ice_ptp_cleanup_pf(pf);
err_exit:
/* If we registered a PTP clock, release it */
if (pf->ptp.clock) {
ptp_clock_unregister(ptp->clock);
pf->ptp.clock = NULL;
}
- ptp->state = ICE_PTP_ERROR;
+ /* Keep ICE_PTP_UNINIT state to avoid ambiguity at driver unload
+ * and to avoid duplicated resources release.
+ */
+ ptp->state = ICE_PTP_UNINIT;
dev_err(ice_pf_to_dev(pf), "PTP failed %d\n", err);
}
@@ -3282,9 +3288,19 @@ err_exit:
*/
void ice_ptp_release(struct ice_pf *pf)
{
- if (pf->ptp.state != ICE_PTP_READY)
+ if (pf->ptp.state == ICE_PTP_UNINIT)
return;
+ if (pf->ptp.state != ICE_PTP_READY) {
+ mutex_destroy(&pf->ptp.port.ps_lock);
+ ice_ptp_cleanup_pf(pf);
+ if (pf->ptp.clock) {
+ ptp_clock_unregister(pf->ptp.clock);
+ pf->ptp.clock = NULL;
+ }
+ return;
+ }
+
pf->ptp.state = ICE_PTP_UNINIT;
/* Disable timestamping for both Tx and Rx */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c
index 8c46481d2e1f..8cf4ff697572 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_main.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_main.c
@@ -63,6 +63,8 @@ destroy_wqs:
destroy_workqueue(adapter->vc_event_wq);
for (i = 0; i < adapter->max_vports; i++) {
+ if (!adapter->vport_config[i])
+ continue;
kfree(adapter->vport_config[i]->user_config.q_coalesce);
kfree(adapter->vport_config[i]);
adapter->vport_config[i] = NULL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index e9f319a9bdd6..60f7ab1d72e7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -66,8 +66,8 @@ void mlx5_cq_tasklet_cb(struct tasklet_struct *t)
tasklet_schedule(&ctx->task);
}
-static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
- struct mlx5_eqe *eqe)
+void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
+ struct mlx5_eqe *eqe)
{
unsigned long flags;
struct mlx5_eq_tasklet *tasklet_ctx = cq->tasklet_ctx.priv;
@@ -95,7 +95,15 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq,
if (schedule_tasklet)
tasklet_schedule(&tasklet_ctx->task);
}
+EXPORT_SYMBOL(mlx5_add_cq_to_tasklet);
+static void mlx5_core_cq_dummy_cb(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
+{
+ mlx5_core_err(cq->eq->core.dev,
+ "CQ default completion callback, CQ #%u\n", cq->cqn);
+}
+
+#define MLX5_CQ_INIT_CMD_SN cpu_to_be32(2 << 28)
/* Callers must verify outbox status in case of err */
int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
u32 *in, int inlen, u32 *out, int outlen)
@@ -121,10 +129,19 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
cq->arm_sn = 0;
cq->eq = eq;
cq->uid = MLX5_GET(create_cq_in, in, uid);
+
+ /* Kernel CQs must set the arm_db address prior to calling
+ * this function, allowing for the proper value to be
+ * initialized. User CQs are responsible for their own
+ * initialization since they do not use the arm_db field.
+ */
+ if (cq->arm_db)
+ *cq->arm_db = MLX5_CQ_INIT_CMD_SN;
+
refcount_set(&cq->refcount, 1);
init_completion(&cq->free);
if (!cq->comp)
- cq->comp = mlx5_add_cq_to_tasklet;
+ cq->comp = mlx5_core_cq_dummy_cb;
/* assuming CQ will be deleted before the EQ */
cq->tasklet_ctx.priv = &eq->tasklet_ctx;
INIT_LIST_HEAD(&cq->tasklet_ctx.list);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index fceea83abbd7..887adf4807d1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -541,7 +541,7 @@ static int mlx5_devlink_num_doorbells_validate(struct devlink *devlink, u32 id,
max_num_channels = mlx5e_get_max_num_channels(mdev);
if (val32 > max_num_channels) {
NL_SET_ERR_MSG_FMT_MOD(extack,
- "Requested num_doorbells (%u) exceeds maximum number of channels (%u)",
+ "Requested num_doorbells (%u) exceeds max number of channels (%u)",
val32, max_num_channels);
return -EINVAL;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
index 0a4fb8c92268..35d9530037a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
@@ -804,7 +804,8 @@ static int mlx5e_xfrm_add_state(struct net_device *dev,
goto err_xfrm;
}
- if (mlx5_eswitch_block_mode(priv->mdev))
+ err = mlx5_eswitch_block_mode(priv->mdev);
+ if (err)
goto unblock_ipsec;
if (x->props.mode == XFRM_MODE_TUNNEL &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
index d166c0d5189e..cf8f14ce4cd5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
@@ -595,32 +595,55 @@ static int mlx5e_dcbnl_ieee_setmaxrate(struct net_device *netdev,
struct mlx5_core_dev *mdev = priv->mdev;
u8 max_bw_value[IEEE_8021QAZ_MAX_TCS];
u8 max_bw_unit[IEEE_8021QAZ_MAX_TCS];
- __u64 upper_limit_mbps = roundup(255 * MLX5E_100MB, MLX5E_1GB);
+ __u64 upper_limit_mbps;
+ __u64 upper_limit_gbps;
int i;
+ struct {
+ int scale;
+ const char *units_str;
+ } units[] = {
+ [MLX5_100_MBPS_UNIT] = {
+ .scale = 100,
+ .units_str = "Mbps",
+ },
+ [MLX5_GBPS_UNIT] = {
+ .scale = 1,
+ .units_str = "Gbps",
+ },
+ };
memset(max_bw_value, 0, sizeof(max_bw_value));
memset(max_bw_unit, 0, sizeof(max_bw_unit));
+ upper_limit_mbps = 255 * MLX5E_100MB;
+ upper_limit_gbps = 255 * MLX5E_1GB;
for (i = 0; i <= mlx5_max_tc(mdev); i++) {
if (!maxrate->tc_maxrate[i]) {
max_bw_unit[i] = MLX5_BW_NO_LIMIT;
continue;
}
- if (maxrate->tc_maxrate[i] < upper_limit_mbps) {
+ if (maxrate->tc_maxrate[i] <= upper_limit_mbps) {
max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
MLX5E_100MB);
max_bw_value[i] = max_bw_value[i] ? max_bw_value[i] : 1;
max_bw_unit[i] = MLX5_100_MBPS_UNIT;
- } else {
+ } else if (maxrate->tc_maxrate[i] <= upper_limit_gbps) {
max_bw_value[i] = div_u64(maxrate->tc_maxrate[i],
MLX5E_1GB);
max_bw_unit[i] = MLX5_GBPS_UNIT;
+ } else {
+ netdev_err(netdev,
+ "tc_%d maxrate %llu Kbps exceeds limit %llu\n",
+ i, maxrate->tc_maxrate[i],
+ upper_limit_gbps);
+ return -EINVAL;
}
}
for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) {
- netdev_dbg(netdev, "%s: tc_%d <=> max_bw %d Gbps\n",
- __func__, i, max_bw_value[i]);
+ netdev_dbg(netdev, "%s: tc_%d <=> max_bw %u %s\n", __func__, i,
+ max_bw_value[i] * units[max_bw_unit[i]].scale,
+ units[max_bw_unit[i]].units_str);
}
return mlx5_modify_port_ets_rate_limit(mdev, max_bw_value, max_bw_unit);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6023bbbf3f39..5e17eae81f4b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2219,7 +2219,6 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev,
mcq->set_ci_db = cq->wq_ctrl.db.db;
mcq->arm_db = cq->wq_ctrl.db.db + 1;
*mcq->set_ci_db = 0;
- *mcq->arm_db = 0;
mcq->vector = param->eq_ix;
mcq->comp = mlx5e_completion_event;
mcq->event = mlx5e_cq_error_event;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index cb1319974f83..ccef64fb40b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -421,6 +421,13 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
__be64 *pas;
u32 i;
+ conn->cq.mcq.cqe_sz = 64;
+ conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db;
+ conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1;
+ *conn->cq.mcq.set_ci_db = 0;
+ conn->cq.mcq.vector = 0;
+ conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete;
+
cq_size = roundup_pow_of_two(cq_size);
MLX5_SET(cqc, temp_cqc, log_cq_size, ilog2(cq_size));
@@ -468,15 +475,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
if (err)
goto err_cqwq;
- conn->cq.mcq.cqe_sz = 64;
- conn->cq.mcq.set_ci_db = conn->cq.wq_ctrl.db.db;
- conn->cq.mcq.arm_db = conn->cq.wq_ctrl.db.db + 1;
- *conn->cq.mcq.set_ci_db = 0;
- *conn->cq.mcq.arm_db = 0;
- conn->cq.mcq.vector = 0;
- conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete;
tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet);
-
mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn);
goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index e18a850c615c..aa3b5878e3da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -324,10 +324,8 @@ err_xa:
free_irq(irq->map.virq, &irq->nh);
err_req_irq:
#ifdef CONFIG_RFS_ACCEL
- if (i && rmap && *rmap) {
- free_irq_cpu_rmap(*rmap);
- *rmap = NULL;
- }
+ if (i && rmap && *rmap)
+ irq_cpu_rmap_remove(*rmap, irq->map.virq);
err_irq_rmap:
#endif
if (i && pci_msix_can_alloc_dyn(dev->pdev))
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index 24ef7d66fa8a..7510c46e58a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -873,12 +873,6 @@ err_free_sqc:
return err;
}
-static void hws_cq_complete(struct mlx5_core_cq *mcq,
- struct mlx5_eqe *eqe)
-{
- pr_err("CQ completion CQ: #%u\n", mcq->cqn);
-}
-
static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev,
int numa_node,
struct mlx5hws_send_engine *queue,
@@ -901,7 +895,6 @@ static int hws_send_ring_alloc_cq(struct mlx5_core_dev *mdev,
mcq->cqe_sz = 64;
mcq->set_ci_db = cq->wq_ctrl.db.db;
mcq->arm_db = cq->wq_ctrl.db.db + 1;
- mcq->comp = hws_cq_complete;
for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c
index 077a77fde670..d034372fa047 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c
@@ -1049,12 +1049,6 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn)
return 0;
}
-static void dr_cq_complete(struct mlx5_core_cq *mcq,
- struct mlx5_eqe *eqe)
-{
- pr_err("CQ completion CQ: #%u\n", mcq->cqn);
-}
-
static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
struct mlx5_uars_page *uar,
size_t ncqe)
@@ -1089,6 +1083,13 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
cqe->op_own = MLX5_CQE_INVALID << 4 | MLX5_CQE_OWNER_MASK;
}
+ cq->mcq.cqe_sz = 64;
+ cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
+ cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ cq->mcq.vector = 0;
+ cq->mdev = mdev;
+
inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
sizeof(u64) * cq->wq_ctrl.buf.npages;
in = kvzalloc(inlen, GFP_KERNEL);
@@ -1112,27 +1113,12 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, pas);
- cq->mcq.comp = dr_cq_complete;
-
err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
kvfree(in);
if (err)
goto err_cqwq;
- cq->mcq.cqe_sz = 64;
- cq->mcq.set_ci_db = cq->wq_ctrl.db.db;
- cq->mcq.arm_db = cq->wq_ctrl.db.db + 1;
- *cq->mcq.set_ci_db = 0;
-
- /* set no-zero value, in order to avoid the HW to run db-recovery on
- * CQ that used in polling mode.
- */
- *cq->mcq.arm_db = cpu_to_be32(2 << 28);
-
- cq->mcq.vector = 0;
- cq->mdev = mdev;
-
return cq;
err_cqwq:
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c
index b032d5a4b3b8..10f5bc4892fc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_linecards.c
@@ -601,6 +601,8 @@ int mlxsw_linecard_devlink_info_get(struct mlxsw_linecard *linecard,
err = devlink_info_version_fixed_put(req,
DEVLINK_INFO_VERSION_GENERIC_FW_PSID,
info->psid);
+ if (err)
+ goto unlock;
sprintf(buf, "%u.%u.%u", info->fw_major, info->fw_minor,
info->fw_sub_minor);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 6a4a81c63451..353fd9ca89a6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -830,8 +830,10 @@ int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp,
return -EINVAL;
rule = mlxsw_sp_acl_rule_lookup(mlxsw_sp, ruleset, f->cookie);
- if (!rule)
- return -EINVAL;
+ if (!rule) {
+ err = -EINVAL;
+ goto err_rule_get_stats;
+ }
err = mlxsw_sp_acl_rule_get_stats(mlxsw_sp, rule, &packets, &bytes,
&drops, &lastuse, &used_hw_stats);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
index c87cb9ed09e7..fcd9912e7ad3 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c
@@ -201,7 +201,7 @@ static int fbnic_mbx_alloc_rx_msgs(struct fbnic_dev *fbd)
return -ENODEV;
/* Fill all but 1 unused descriptors in the Rx queue. */
- count = (head - tail - 1) % FBNIC_IPC_MBX_DESC_LEN;
+ count = (head - tail - 1) & (FBNIC_IPC_MBX_DESC_LEN - 1);
while (!err && count--) {
struct fbnic_tlv_msg *msg;
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
index b4377b8613c3..8c40db90ee8f 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c
@@ -1,11 +1,14 @@
// SPDX-License-Identifier: GPL-2.0+
#include <linux/ptp_classify.h>
+#include <linux/units.h>
#include "lan966x_main.h"
#include "vcap_api.h"
#include "vcap_api_client.h"
+#define LAN9X66_CLOCK_RATE 165617754
+
#define LAN966X_MAX_PTP_ID 512
/* Represents 1ppm adjustment in 2^59 format with 6.037735849ns as reference
@@ -1126,5 +1129,5 @@ void lan966x_ptp_rxtstamp(struct lan966x *lan966x, struct sk_buff *skb,
u32 lan966x_ptp_get_period_ps(void)
{
/* This represents the system clock period in picoseconds */
- return 15125;
+ return PICO / LAN9X66_CLOCK_RATE;
}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 847fa62c80df..e338bfc8b7b2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -4,6 +4,7 @@
* Copyright (c) 2019-2020 Marvell International Ltd.
*/
+#include <linux/array_size.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
@@ -960,7 +961,7 @@ static inline void qede_tpa_cont(struct qede_dev *edev,
{
int i;
- for (i = 0; cqe->len_list[i]; i++)
+ for (i = 0; i < ARRAY_SIZE(cqe->len_list) && cqe->len_list[i]; i++)
qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index,
le16_to_cpu(cqe->len_list[i]));
@@ -985,7 +986,7 @@ static int qede_tpa_end(struct qede_dev *edev,
dma_unmap_page(rxq->dev, tpa_info->buffer.mapping,
PAGE_SIZE, rxq->data_direction);
- for (i = 0; cqe->len_list[i]; i++)
+ for (i = 0; i < ARRAY_SIZE(cqe->len_list) && cqe->len_list[i]; i++)
qede_fill_frag_skb(edev, rxq, cqe->tpa_agg_index,
le16_to_cpu(cqe->len_list[i]));
if (unlikely(i > 1))
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index d18734fe12e4..853aabedb128 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -1514,11 +1514,20 @@ static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp)
static void rtl_set_d3_pll_down(struct rtl8169_private *tp, bool enable)
{
- if (tp->mac_version >= RTL_GIGA_MAC_VER_25 &&
- tp->mac_version != RTL_GIGA_MAC_VER_28 &&
- tp->mac_version != RTL_GIGA_MAC_VER_31 &&
- tp->mac_version != RTL_GIGA_MAC_VER_38)
- r8169_mod_reg8_cond(tp, PMCH, D3_NO_PLL_DOWN, !enable);
+ switch (tp->mac_version) {
+ case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_24:
+ case RTL_GIGA_MAC_VER_28:
+ case RTL_GIGA_MAC_VER_31:
+ case RTL_GIGA_MAC_VER_38:
+ break;
+ case RTL_GIGA_MAC_VER_80:
+ r8169_mod_reg8_cond(tp, PMCH, D3_NO_PLL_DOWN, true);
+ break;
+ default:
+ r8169_mod_reg8_cond(tp, PMCH, D3HOT_NO_PLL_DOWN, true);
+ r8169_mod_reg8_cond(tp, PMCH, D3COLD_NO_PLL_DOWN, !enable);
+ break;
+ }
}
static void rtl_reset_packet_filter(struct rtl8169_private *tp)
diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
index 75bad561b352..849c5a6c2af1 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c
@@ -1521,8 +1521,10 @@ static int sxgbe_rx(struct sxgbe_priv_data *priv, int limit)
skb = priv->rxq[qnum]->rx_skbuff[entry];
- if (unlikely(!skb))
+ if (unlikely(!skb)) {
netdev_err(priv->dev, "rx descriptor is not consistent\n");
+ break;
+ }
prefetch(skb->data - NET_IP_ALIGN);
priv->rxq[qnum]->rx_skbuff[entry] = NULL;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-qos.c b/drivers/net/ethernet/ti/am65-cpsw-qos.c
index fa96db7c1a13..66e8b224827b 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-qos.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-qos.c
@@ -276,9 +276,31 @@ static int am65_cpsw_iet_set_verify_timeout_count(struct am65_cpsw_port *port)
/* The number of wireside clocks contained in the verify
* timeout counter. The default is 0x1312d0
* (10ms at 125Mhz in 1G mode).
+ * The frequency of the clock depends on the link speed
+ * and the PHY interface.
*/
- val = 125 * HZ_PER_MHZ; /* assuming 125MHz wireside clock */
+ switch (port->slave.phy_if) {
+ case PHY_INTERFACE_MODE_RGMII:
+ case PHY_INTERFACE_MODE_RGMII_ID:
+ case PHY_INTERFACE_MODE_RGMII_RXID:
+ case PHY_INTERFACE_MODE_RGMII_TXID:
+ if (port->qos.link_speed == SPEED_1000)
+ val = 125 * HZ_PER_MHZ; /* 125 MHz at 1000Mbps*/
+ else if (port->qos.link_speed == SPEED_100)
+ val = 25 * HZ_PER_MHZ; /* 25 MHz at 100Mbps*/
+ else
+ val = (25 * HZ_PER_MHZ) / 10; /* 2.5 MHz at 10Mbps*/
+ break;
+
+ case PHY_INTERFACE_MODE_QSGMII:
+ case PHY_INTERFACE_MODE_SGMII:
+ val = 125 * HZ_PER_MHZ; /* 125 MHz */
+ break;
+ default:
+ netdev_err(port->ndev, "selected mode does not supported IET\n");
+ return -EOPNOTSUPP;
+ }
val /= MILLIHZ_PER_HZ; /* count per ms timeout */
val *= verify_time_ms; /* count for timeout ms */
@@ -295,20 +317,21 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port)
u32 ctrl, status;
int try;
- try = 20;
- do {
- /* Reset the verify state machine by writing 1
- * to LINKFAIL
- */
- ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
- ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL;
- writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+ try = 3;
- /* Clear MAC_LINKFAIL bit to start Verify. */
- ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
- ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL;
- writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+ /* Reset the verify state machine by writing 1
+ * to LINKFAIL
+ */
+ ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+ ctrl |= AM65_CPSW_PN_IET_MAC_LINKFAIL;
+ writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+ /* Clear MAC_LINKFAIL bit to start Verify. */
+ ctrl = readl(port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+ ctrl &= ~AM65_CPSW_PN_IET_MAC_LINKFAIL;
+ writel(ctrl, port->port_base + AM65_CPSW_PN_REG_IET_CTRL);
+
+ do {
msleep(port->qos.iet.verify_time_ms);
status = readl(port->port_base + AM65_CPSW_PN_REG_IET_STATUS);
@@ -330,7 +353,7 @@ static int am65_cpsw_iet_verify_wait(struct am65_cpsw_port *port)
netdev_dbg(port->ndev, "MAC Merge verify error\n");
return -ENODEV;
}
- } while (try-- > 0);
+ } while (--try > 0);
netdev_dbg(port->ndev, "MAC Merge verify timeout\n");
return -ETIMEDOUT;
diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
index 5ee8e8980393..591866fc9055 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -260,6 +260,7 @@ void gelic_card_down(struct gelic_card *card)
if (atomic_dec_if_positive(&card->users) == 0) {
pr_debug("%s: real do\n", __func__);
napi_disable(&card->napi);
+ timer_delete_sync(&card->rx_oom_timer);
/*
* Disable irq. Wireless interrupts will
* be disabled later if any
@@ -970,7 +971,8 @@ static void gelic_net_pass_skb_up(struct gelic_descr *descr,
* gelic_card_decode_one_descr - processes an rx descriptor
* @card: card structure
*
- * returns 1 if a packet has been sent to the stack, otherwise 0
+ * returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc
+ * failure, otherwise 0
*
* processes an rx descriptor by iommu-unmapping the data buffer and passing
* the packet up to the stack
@@ -981,16 +983,18 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
struct gelic_descr_chain *chain = &card->rx_chain;
struct gelic_descr *descr = chain->head;
struct net_device *netdev = NULL;
- int dmac_chain_ended;
+ int dmac_chain_ended = 0;
+ int prepare_rx_ret;
status = gelic_descr_get_status(descr);
if (status == GELIC_DESCR_DMA_CARDOWNED)
return 0;
- if (status == GELIC_DESCR_DMA_NOT_IN_USE) {
+ if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) {
dev_dbg(ctodev(card), "dormant descr? %p\n", descr);
- return 0;
+ dmac_chain_ended = 1;
+ goto refill;
}
/* netdevice select */
@@ -1048,9 +1052,10 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
refill:
/* is the current descriptor terminated with next_descr == NULL? */
- dmac_chain_ended =
- be32_to_cpu(descr->hw_regs.dmac_cmd_status) &
- GELIC_DESCR_RX_DMA_CHAIN_END;
+ if (!dmac_chain_ended)
+ dmac_chain_ended =
+ be32_to_cpu(descr->hw_regs.dmac_cmd_status) &
+ GELIC_DESCR_RX_DMA_CHAIN_END;
/*
* So that always DMAC can see the end
* of the descriptor chain to avoid
@@ -1062,10 +1067,11 @@ refill:
gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE);
/*
- * this call can fail, but for now, just leave this
- * descriptor without skb
+ * this call can fail, propagate the error
*/
- gelic_descr_prepare_rx(card, descr);
+ prepare_rx_ret = gelic_descr_prepare_rx(card, descr);
+ if (prepare_rx_ret)
+ return prepare_rx_ret;
chain->tail = descr;
chain->head = descr->next;
@@ -1087,6 +1093,13 @@ refill:
return 1;
}
+static void gelic_rx_oom_timer(struct timer_list *t)
+{
+ struct gelic_card *card = timer_container_of(card, t, rx_oom_timer);
+
+ napi_schedule(&card->napi);
+}
+
/**
* gelic_net_poll - NAPI poll function called by the stack to return packets
* @napi: napi structure
@@ -1099,14 +1112,22 @@ static int gelic_net_poll(struct napi_struct *napi, int budget)
{
struct gelic_card *card = container_of(napi, struct gelic_card, napi);
int packets_done = 0;
+ int work_result = 0;
while (packets_done < budget) {
- if (!gelic_card_decode_one_descr(card))
+ work_result = gelic_card_decode_one_descr(card);
+ if (work_result != 1)
break;
packets_done++;
}
+ if (work_result == -ENOMEM) {
+ napi_complete_done(napi, packets_done);
+ mod_timer(&card->rx_oom_timer, jiffies + 1);
+ return packets_done;
+ }
+
if (packets_done < budget) {
napi_complete_done(napi, packets_done);
gelic_card_rx_irq_on(card);
@@ -1576,6 +1597,8 @@ static struct gelic_card *gelic_alloc_card_net(struct net_device **netdev)
mutex_init(&card->updown_lock);
atomic_set(&card->users, 0);
+ timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0);
+
return card;
}
diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.h b/drivers/net/ethernet/toshiba/ps3_gelic_net.h
index f7d7931e51b7..c10f1984a5a1 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.h
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.h
@@ -268,6 +268,7 @@ struct gelic_vlan_id {
struct gelic_card {
struct napi_struct napi;
struct net_device *netdev[GELIC_PORT_MAX];
+ struct timer_list rx_oom_timer;
/*
* hypervisor requires irq_status should be
* 8 bytes aligned, but u64 member is
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index cad6ed3aa10b..4354241137d5 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -73,8 +73,11 @@ int mdiobus_register_device(struct mdio_device *mdiodev)
return err;
err = mdiobus_register_reset(mdiodev);
- if (err)
+ if (err) {
+ gpiod_put(mdiodev->reset_gpio);
+ mdiodev->reset_gpio = NULL;
return err;
+ }
/* Assert the reset signal */
mdio_device_reset(mdiodev, 1);
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 6a1a424e3b30..01c87c9b7702 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -4380,12 +4380,6 @@ static int lan8814_config_init(struct phy_device *phydev)
{
struct kszphy_priv *lan8814 = phydev->priv;
- /* Reset the PHY */
- lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS,
- LAN8814_QSGMII_SOFT_RESET,
- LAN8814_QSGMII_SOFT_RESET_BIT,
- LAN8814_QSGMII_SOFT_RESET_BIT);
-
/* Disable ANEG with QSGMII PCS Host side */
lanphy_modify_page_reg(phydev, LAN8814_PAGE_PORT_REGS,
LAN8814_QSGMII_PCS1G_ANEG_CONFIG,
@@ -4471,6 +4465,12 @@ static int lan8814_probe(struct phy_device *phydev)
addr, sizeof(struct lan8814_shared_priv));
if (phy_package_init_once(phydev)) {
+ /* Reset the PHY */
+ lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS,
+ LAN8814_QSGMII_SOFT_RESET,
+ LAN8814_QSGMII_SOFT_RESET_BIT,
+ LAN8814_QSGMII_SOFT_RESET_BIT);
+
err = lan8814_release_coma_mode(phydev);
if (err)
return err;
diff --git a/drivers/net/phy/mxl-gpy.c b/drivers/net/phy/mxl-gpy.c
index 0c8dc16ee7bd..2a873f791733 100644
--- a/drivers/net/phy/mxl-gpy.c
+++ b/drivers/net/phy/mxl-gpy.c
@@ -540,7 +540,7 @@ static int gpy_update_interface(struct phy_device *phydev)
/* Interface mode is fixed for USXGMII and integrated PHY */
if (phydev->interface == PHY_INTERFACE_MODE_USXGMII ||
phydev->interface == PHY_INTERFACE_MODE_INTERNAL)
- return -EINVAL;
+ return 0;
/* Automatically switch SERDES interface between SGMII and 2500-BaseX
* according to speed. Disable ANEG in 2500-BaseX mode.
@@ -578,13 +578,7 @@ static int gpy_update_interface(struct phy_device *phydev)
break;
}
- if (phydev->speed == SPEED_2500 || phydev->speed == SPEED_1000) {
- ret = genphy_read_master_slave(phydev);
- if (ret < 0)
- return ret;
- }
-
- return gpy_update_mdix(phydev);
+ return 0;
}
static int gpy_read_status(struct phy_device *phydev)
@@ -639,6 +633,16 @@ static int gpy_read_status(struct phy_device *phydev)
ret = gpy_update_interface(phydev);
if (ret < 0)
return ret;
+
+ if (phydev->speed == SPEED_2500 || phydev->speed == SPEED_1000) {
+ ret = genphy_read_master_slave(phydev);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = gpy_update_mdix(phydev);
+ if (ret < 0)
+ return ret;
}
return 0;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 9d7799ea1c17..918244308215 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -637,6 +637,9 @@ static int phylink_validate(struct phylink *pl, unsigned long *supported,
static void phylink_fill_fixedlink_supported(unsigned long *supported)
{
+ linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported);
+ linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported);
+ linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, supported);
linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, supported);
linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, supported);
linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, supported);
diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c
index 17f07eb0ee52..25562b17debe 100644
--- a/drivers/net/team/team_core.c
+++ b/drivers/net/team/team_core.c
@@ -1191,10 +1191,6 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
return -EPERM;
}
- err = team_dev_type_check_change(dev, port_dev);
- if (err)
- return err;
-
if (port_dev->flags & IFF_UP) {
NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port");
netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n",
@@ -1212,10 +1208,16 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
INIT_LIST_HEAD(&port->qom_list);
port->orig.mtu = port_dev->mtu;
- err = dev_set_mtu(port_dev, dev->mtu);
- if (err) {
- netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
- goto err_set_mtu;
+ /*
+ * MTU assignment will be handled in team_dev_type_check_change
+ * if dev and port_dev are of different types
+ */
+ if (dev->type == port_dev->type) {
+ err = dev_set_mtu(port_dev, dev->mtu);
+ if (err) {
+ netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err);
+ goto err_set_mtu;
+ }
}
memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len);
@@ -1290,6 +1292,10 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
}
}
+ err = team_dev_type_check_change(dev, port_dev);
+ if (err)
+ goto err_set_dev_type;
+
if (dev->flags & IFF_UP) {
netif_addr_lock_bh(dev);
dev_uc_sync_multiple(port_dev, dev);
@@ -1308,6 +1314,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev,
return 0;
+err_set_dev_type:
err_set_slave_promisc:
__team_option_inst_del_port(team, port);
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index 81662328b2c7..a5f93b6c4482 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -244,7 +244,7 @@ tun_vnet_hdr_tnl_from_skb(unsigned int flags,
if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload,
tun_vnet_is_little_endian(flags),
- vlan_hlen)) {
+ vlan_hlen, true)) {
struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr;
struct skb_shared_info *sinfo = skb_shinfo(skb);
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index a3046142cb8e..cc502bf022d5 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -392,14 +392,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
}
/* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */
__skb_push(skb, ETH_HLEN);
- /* Depend on prior success packets started NAPI consumer via
- * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped,
- * paired with empty check in veth_poll().
- */
netif_tx_stop_queue(txq);
- smp_mb__after_atomic();
- if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
- netif_tx_wake_queue(txq);
+ /* Make sure the NAPI peer consumer runs. The consumer is
+ * responsible for starting the txq again; until then the
+ * netstack will not invoke ndo_start_xmit (this function).
+ */
+ __veth_xdp_flush(rq);
break;
case NET_RX_DROP: /* same as NET_XMIT_DROP */
drop:
@@ -900,17 +898,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
struct veth_xdp_tx_bq *bq,
struct veth_stats *stats)
{
- struct veth_priv *priv = netdev_priv(rq->dev);
- int queue_idx = rq->xdp_rxq.queue_index;
- struct netdev_queue *peer_txq;
- struct net_device *peer_dev;
int i, done = 0, n_xdpf = 0;
void *xdpf[VETH_XDP_BATCH];
- /* NAPI functions as RCU section */
- peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
- peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL;
-
for (i = 0; i < budget; i++) {
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
@@ -959,9 +949,6 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
rq->stats.vs.xdp_packets += done;
u64_stats_update_end(&rq->stats.syncp);
- if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq)))
- netif_tx_wake_queue(peer_txq);
-
return done;
}
@@ -969,17 +956,28 @@ static int veth_poll(struct napi_struct *napi, int budget)
{
struct veth_rq *rq =
container_of(napi, struct veth_rq, xdp_napi);
+ struct veth_priv *priv = netdev_priv(rq->dev);
+ int queue_idx = rq->xdp_rxq.queue_index;
+ struct netdev_queue *peer_txq;
struct veth_stats stats = {};
+ struct net_device *peer_dev;
struct veth_xdp_tx_bq bq;
int done;
bq.count = 0;
+ /* NAPI functions as RCU section */
+ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
+ peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL;
+
xdp_set_return_frame_no_direct();
done = veth_xdp_rcv(rq, budget, &bq, &stats);
if (stats.xdp_redirect > 0)
xdp_do_flush();
+ if (stats.xdp_tx > 0)
+ veth_xdp_flush(rq, &bq);
+ xdp_clear_return_frame_no_direct();
if (done < budget && napi_complete_done(napi, done)) {
/* Write rx_notify_masked before reading ptr_ring */
@@ -992,9 +990,12 @@ static int veth_poll(struct napi_struct *napi, int budget)
}
}
- if (stats.xdp_tx > 0)
- veth_xdp_flush(rq, &bq);
- xdp_clear_return_frame_no_direct();
+ /* Release backpressure per NAPI poll */
+ smp_rmb(); /* Paired with netif_tx_stop_queue set_bit */
+ if (peer_txq && netif_tx_queue_stopped(peer_txq)) {
+ txq_trans_cond_update(peer_txq);
+ netif_tx_wake_queue(peer_txq);
+ }
return done;
}
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8855a994e12b..8e04adb57f52 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2631,22 +2631,28 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
return;
}
- /* 1. Save the flags early, as the XDP program might overwrite them.
+ /* About the flags below:
+ * 1. Save the flags early, as the XDP program might overwrite them.
* These flags ensure packets marked as VIRTIO_NET_HDR_F_DATA_VALID
* stay valid after XDP processing.
* 2. XDP doesn't work with partially checksummed packets (refer to
* virtnet_xdp_set()), so packets marked as
* VIRTIO_NET_HDR_F_NEEDS_CSUM get dropped during XDP processing.
*/
- flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags;
- if (vi->mergeable_rx_bufs)
+ if (vi->mergeable_rx_bufs) {
+ flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags;
skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit,
stats);
- else if (vi->big_packets)
+ } else if (vi->big_packets) {
+ void *p = page_address((struct page *)buf);
+
+ flags = ((struct virtio_net_common_hdr *)p)->hdr.flags;
skb = receive_big(dev, vi, rq, buf, len, stats);
- else
+ } else {
+ flags = ((struct virtio_net_common_hdr *)buf)->hdr.flags;
skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit, stats);
+ }
if (unlikely(!skb))
return;
@@ -3333,7 +3339,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
hdr = &skb_vnet_common_hdr(skb)->tnl_hdr;
if (virtio_net_hdr_tnl_from_skb(skb, hdr, vi->tx_tnl,
- virtio_is_little_endian(vi->vdev), 0))
+ virtio_is_little_endian(vi->vdev), 0,
+ false))
return -EPROTO;
if (vi->mergeable_rx_bufs)
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index 0491e3fd6b5e..e3b444333dee 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -5961,6 +5961,9 @@ static int wmi_process_mgmt_tx_comp(struct ath11k *ar,
dma_unmap_single(ar->ab->dev, skb_cb->paddr, msdu->len, DMA_TO_DEVICE);
info = IEEE80211_SKB_CB(msdu);
+ memset(&info->status, 0, sizeof(info->status));
+ info->status.rates[0].idx = -1;
+
if ((!(info->flags & IEEE80211_TX_CTL_NO_ACK)) &&
!tx_compl_param->status) {
info->flags |= IEEE80211_TX_STAT_ACK;
diff --git a/drivers/net/wireless/intel/iwlwifi/mld/link.c b/drivers/net/wireless/intel/iwlwifi/mld/link.c
index 60d814bf5779..f6f52d297a72 100644
--- a/drivers/net/wireless/intel/iwlwifi/mld/link.c
+++ b/drivers/net/wireless/intel/iwlwifi/mld/link.c
@@ -708,18 +708,13 @@ static int
iwl_mld_get_chan_load_from_element(struct iwl_mld *mld,
struct ieee80211_bss_conf *link_conf)
{
- struct ieee80211_vif *vif = link_conf->vif;
const struct cfg80211_bss_ies *ies;
const struct element *bss_load_elem = NULL;
const struct ieee80211_bss_load_elem *bss_load;
guard(rcu)();
- if (ieee80211_vif_link_active(vif, link_conf->link_id))
- ies = rcu_dereference(link_conf->bss->beacon_ies);
- else
- ies = rcu_dereference(link_conf->bss->ies);
-
+ ies = rcu_dereference(link_conf->bss->beacon_ies);
if (ies)
bss_load_elem = cfg80211_find_elem(WLAN_EID_QBSS_LOAD,
ies->data, ies->len);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
index 9c9e0e1c6e1d..867807abde66 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c
@@ -938,19 +938,12 @@ u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct iwl_mvm *mvm,
u16 iwl_mvm_mac_ctxt_get_beacon_flags(const struct iwl_fw *fw, u8 rate_idx)
{
+ u16 flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx);
bool is_new_rate = iwl_fw_lookup_cmd_ver(fw, BEACON_TEMPLATE_CMD, 0) > 10;
- u16 flags, cck_flag;
-
- if (is_new_rate) {
- flags = iwl_mvm_mac80211_idx_to_hwrate(fw, rate_idx);
- cck_flag = IWL_MAC_BEACON_CCK;
- } else {
- cck_flag = IWL_MAC_BEACON_CCK_V1;
- flags = iwl_fw_rate_idx_to_plcp(rate_idx);
- }
if (rate_idx <= IWL_LAST_CCK_RATE)
- flags |= cck_flag;
+ flags |= is_new_rate ? IWL_MAC_BEACON_CCK
+ : IWL_MAC_BEACON_CCK_V1;
return flags;
}
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
index 0c9c2492d8a7..0b12ee8ad618 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c
@@ -463,7 +463,7 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm,
if (!aux_roc_te) /* Not a Aux ROC time event */
return -EINVAL;
- iwl_mvm_te_check_trigger(mvm, notif, te_data);
+ iwl_mvm_te_check_trigger(mvm, notif, aux_roc_te);
IWL_DEBUG_TE(mvm,
"Aux ROC time event notification - UID = 0x%x action %d (error = %d)\n",
@@ -475,14 +475,14 @@ static int iwl_mvm_aux_roc_te_handle_notif(struct iwl_mvm *mvm,
/* End TE, notify mac80211 */
ieee80211_remain_on_channel_expired(mvm->hw);
iwl_mvm_roc_finished(mvm); /* flush aux queue */
- list_del(&te_data->list); /* remove from list */
- te_data->running = false;
- te_data->vif = NULL;
- te_data->uid = 0;
- te_data->id = TE_MAX;
+ list_del(&aux_roc_te->list); /* remove from list */
+ aux_roc_te->running = false;
+ aux_roc_te->vif = NULL;
+ aux_roc_te->uid = 0;
+ aux_roc_te->id = TE_MAX;
} else if (le32_to_cpu(notif->action) == TE_V2_NOTIF_HOST_EVENT_START) {
set_bit(IWL_MVM_STATUS_ROC_AUX_RUNNING, &mvm->status);
- te_data->running = true;
+ aux_roc_te->running = true;
ieee80211_ready_on_channel(mvm->hw); /* Start TE */
} else {
IWL_DEBUG_TE(mvm,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index 22602c32faa5..fa995e235d9b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -159,9 +159,15 @@ int iwl_mvm_legacy_rate_to_mac80211_idx(u32 rate_n_flags,
u8 iwl_mvm_mac80211_idx_to_hwrate(const struct iwl_fw *fw, int rate_idx)
{
- return (rate_idx >= IWL_FIRST_OFDM_RATE ?
- rate_idx - IWL_FIRST_OFDM_RATE :
- rate_idx);
+ if (iwl_fw_lookup_cmd_ver(fw, TX_CMD, 0) > 8)
+ /* In the new rate format, legacy rates are indexed
+ * 0 - 3 for CCK and 0 - 7 for OFDM.
+ */
+ return (rate_idx >= IWL_FIRST_OFDM_RATE ?
+ rate_idx - IWL_FIRST_OFDM_RATE :
+ rate_idx);
+
+ return iwl_fw_rate_idx_to_plcp(rate_idx);
}
u8 iwl_mvm_mac80211_ac_to_ucode_ac(enum ieee80211_ac_numbers ac)
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index 891e125ad30b..54d6d00ecdf1 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -2966,6 +2966,51 @@ mwl8k_cmd_rf_antenna(struct ieee80211_hw *hw, int antenna, int mask)
/*
* CMD_SET_BEACON.
*/
+
+static bool mwl8k_beacon_has_ds_params(const u8 *buf, int len)
+{
+ const struct ieee80211_mgmt *mgmt = (const void *)buf;
+ int ies_len;
+
+ if (len <= offsetof(struct ieee80211_mgmt, u.beacon.variable))
+ return false;
+
+ ies_len = len - offsetof(struct ieee80211_mgmt, u.beacon.variable);
+
+ return cfg80211_find_ie(WLAN_EID_DS_PARAMS, mgmt->u.beacon.variable,
+ ies_len) != NULL;
+}
+
+static void mwl8k_beacon_copy_inject_ds_params(struct ieee80211_hw *hw,
+ u8 *buf_dst, const u8 *buf_src,
+ int src_len)
+{
+ const struct ieee80211_mgmt *mgmt = (const void *)buf_src;
+ static const u8 before_ds_params[] = {
+ WLAN_EID_SSID,
+ WLAN_EID_SUPP_RATES,
+ };
+ const u8 *ies;
+ int hdr_len, left, offs, pos;
+
+ ies = mgmt->u.beacon.variable;
+ hdr_len = offsetof(struct ieee80211_mgmt, u.beacon.variable);
+
+ offs = ieee80211_ie_split(ies, src_len - hdr_len, before_ds_params,
+ ARRAY_SIZE(before_ds_params), 0);
+
+ pos = hdr_len + offs;
+ left = src_len - pos;
+
+ memcpy(buf_dst, buf_src, pos);
+
+ /* Inject a DSSS Parameter Set after SSID + Supp Rates */
+ buf_dst[pos + 0] = WLAN_EID_DS_PARAMS;
+ buf_dst[pos + 1] = 1;
+ buf_dst[pos + 2] = hw->conf.chandef.chan->hw_value;
+
+ memcpy(buf_dst + pos + 3, buf_src + pos, left);
+}
struct mwl8k_cmd_set_beacon {
struct mwl8k_cmd_pkt_hdr header;
__le16 beacon_len;
@@ -2975,17 +3020,33 @@ struct mwl8k_cmd_set_beacon {
static int mwl8k_cmd_set_beacon(struct ieee80211_hw *hw,
struct ieee80211_vif *vif, u8 *beacon, int len)
{
+ bool ds_params_present = mwl8k_beacon_has_ds_params(beacon, len);
struct mwl8k_cmd_set_beacon *cmd;
- int rc;
+ int rc, final_len = len;
- cmd = kzalloc(sizeof(*cmd) + len, GFP_KERNEL);
+ if (!ds_params_present) {
+ /*
+ * mwl8k firmware requires a DS Params IE with the current
+ * channel in AP beacons. If mac80211/hostapd does not
+ * include it, inject one here. IE ID + length + channel
+ * number = 3 bytes.
+ */
+ final_len += 3;
+ }
+
+ cmd = kzalloc(sizeof(*cmd) + final_len, GFP_KERNEL);
if (cmd == NULL)
return -ENOMEM;
cmd->header.code = cpu_to_le16(MWL8K_CMD_SET_BEACON);
- cmd->header.length = cpu_to_le16(sizeof(*cmd) + len);
- cmd->beacon_len = cpu_to_le16(len);
- memcpy(cmd->beacon, beacon, len);
+ cmd->header.length = cpu_to_le16(sizeof(*cmd) + final_len);
+ cmd->beacon_len = cpu_to_le16(final_len);
+
+ if (ds_params_present)
+ memcpy(cmd->beacon, beacon, len);
+ else
+ mwl8k_beacon_copy_inject_ds_params(hw, cmd->beacon, beacon,
+ len);
rc = mwl8k_post_pervif_cmd(hw, vif, &cmd->header);
kfree(cmd);
diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c
index ab904a7def1b..080c4f8a655a 100644
--- a/drivers/net/wireless/realtek/rtw89/fw.c
+++ b/drivers/net/wireless/realtek/rtw89/fw.c
@@ -7694,6 +7694,13 @@ int rtw89_hw_scan_add_chan_list_ax(struct rtw89_dev *rtwdev,
INIT_LIST_HEAD(&list);
list_for_each_entry_safe(ch_info, tmp, &scan_info->chan_list, list) {
+ /* The operating channel (tx_null == true) should
+ * not be last in the list, to avoid breaking
+ * RTL8851BU and RTL8832BU.
+ */
+ if (list_len + 1 == RTW89_SCAN_LIST_LIMIT_AX && ch_info->tx_null)
+ break;
+
list_move_tail(&ch_info->list, &list);
list_len++;
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index d28bf18d57ec..5903d82e1ab1 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -2003,8 +2003,14 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw,
struct ieee80211_sta *sta = control->sta;
struct ieee80211_bss_conf *bss_conf;
+ /* This can happen in case of monitor injection */
+ if (!vif) {
+ ieee80211_free_txskb(hw, skb);
+ return;
+ }
+
if (link != IEEE80211_LINK_UNSPECIFIED) {
- bss_conf = rcu_dereference(txi->control.vif->link_conf[link]);
+ bss_conf = rcu_dereference(vif->link_conf[link]);
if (sta)
link_sta = rcu_dereference(sta->link[link]);
} else {
@@ -2065,13 +2071,13 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw,
return;
}
- if (txi->control.vif)
- hwsim_check_magic(txi->control.vif);
+ if (vif)
+ hwsim_check_magic(vif);
if (control->sta)
hwsim_check_sta_magic(control->sta);
if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE))
- ieee80211_get_tx_rates(txi->control.vif, control->sta, skb,
+ ieee80211_get_tx_rates(vif, control->sta, skb,
txi->control.rates,
ARRAY_SIZE(txi->control.rates));
diff --git a/drivers/net/wwan/mhi_wwan_mbim.c b/drivers/net/wwan/mhi_wwan_mbim.c
index c814fbd756a1..f8bc9a39bfa3 100644
--- a/drivers/net/wwan/mhi_wwan_mbim.c
+++ b/drivers/net/wwan/mhi_wwan_mbim.c
@@ -98,7 +98,7 @@ static struct mhi_mbim_link *mhi_mbim_get_link_rcu(struct mhi_mbim_context *mbim
static int mhi_mbim_get_link_mux_id(struct mhi_controller *cntrl)
{
if (strcmp(cntrl->name, "foxconn-dw5934e") == 0 ||
- strcmp(cntrl->name, "foxconn-t99w515") == 0)
+ strcmp(cntrl->name, "foxconn-t99w640") == 0)
return WDS_BIND_MUX_DATA_PORT_MUX_ID;
return 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa4181d7de73..f1f719351f3f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4901,7 +4901,6 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
*/
nvme_stop_keep_alive(ctrl);
blk_mq_destroy_queue(ctrl->admin_q);
- blk_put_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->fabrics_q);
blk_put_queue(ctrl->fabrics_q);
@@ -5045,6 +5044,8 @@ static void nvme_free_ctrl(struct device *dev)
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
+ if (ctrl->admin_q)
+ blk_put_queue(ctrl->admin_q);
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
nvme_free_cels(ctrl);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 03987f497a5b..2c903729b0b9 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2355,17 +2355,11 @@ nvme_fc_ctrl_free(struct kref *ref)
container_of(ref, struct nvme_fc_ctrl, ref);
unsigned long flags;
- if (ctrl->ctrl.tagset)
- nvme_remove_io_tag_set(&ctrl->ctrl);
-
/* remove from rport list */
spin_lock_irqsave(&ctrl->rport->lock, flags);
list_del(&ctrl->ctrl_list);
spin_unlock_irqrestore(&ctrl->rport->lock, flags);
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
- nvme_remove_admin_tag_set(&ctrl->ctrl);
-
kfree(ctrl->queues);
put_device(ctrl->dev);
@@ -3259,13 +3253,20 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
- cancel_work_sync(&ctrl->ioerr_work);
cancel_delayed_work_sync(&ctrl->connect_work);
+
/*
* kill the association on the link side. this will block
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
+ cancel_work_sync(&ctrl->ioerr_work);
+
+ if (ctrl->ctrl.tagset)
+ nvme_remove_io_tag_set(&ctrl->ctrl);
+
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
+ nvme_remove_admin_tag_set(&ctrl->ctrl);
}
static void
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 543e17aead12..e35eccacee8c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -793,7 +793,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
return;
}
nvme_add_ns_head_cdev(head);
- kblockd_schedule_work(&head->partition_scan_work);
+ queue_work(nvme_wq, &head->partition_scan_work);
}
nvme_mpath_add_sysfs_link(ns->head);
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index ceba21684e82..300d5e032f6d 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -298,7 +298,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
struct nvme_dhchap_key *transformed_key;
- u8 buf[4], sc_c = ctrl->concat ? 1 : 0;
+ u8 buf[4];
int ret;
hash_name = nvme_auth_hmac_name(ctrl->shash_id);
@@ -367,7 +367,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
- *buf = sc_c;
+ *buf = req->sq->sc_c;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c
index bf01ec414c55..5946681cb0e3 100644
--- a/drivers/nvme/target/fabrics-cmd-auth.c
+++ b/drivers/nvme/target/fabrics-cmd-auth.c
@@ -43,6 +43,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
data->auth_protocol[0].dhchap.halen,
data->auth_protocol[0].dhchap.dhlen);
req->sq->dhchap_tid = le16_to_cpu(data->t_id);
+ req->sq->sc_c = data->sc_c;
if (data->sc_c != NVME_AUTH_SECP_NOSC) {
if (!IS_ENABLED(CONFIG_NVME_TARGET_TCP_TLS))
return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 51df72f5e89b..f3b09f4099f0 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -159,6 +159,7 @@ struct nvmet_sq {
bool authenticated;
struct delayed_work auth_expired_work;
u16 dhchap_tid;
+ u8 sc_c;
u8 dhchap_status;
u8 dhchap_step;
u8 *dhchap_c1;
diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c
index f381ce1e84bd..7ebe53249035 100644
--- a/drivers/nvmem/layouts.c
+++ b/drivers/nvmem/layouts.c
@@ -51,7 +51,7 @@ static int nvmem_layout_bus_uevent(const struct device *dev,
int ret;
ret = of_device_uevent_modalias(dev, env);
- if (ret != ENODEV)
+ if (ret != -ENODEV)
return ret;
return 0;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 4492b809094b..36f8c0985430 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -958,6 +958,7 @@ void pci_save_aspm_l1ss_state(struct pci_dev *dev);
void pci_restore_aspm_l1ss_state(struct pci_dev *dev);
#ifdef CONFIG_PCIEASPM
+void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap);
void pcie_aspm_init_link_state(struct pci_dev *pdev);
void pcie_aspm_exit_link_state(struct pci_dev *pdev);
void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked);
@@ -965,6 +966,7 @@ void pcie_aspm_powersave_config_link(struct pci_dev *pdev);
void pci_configure_ltr(struct pci_dev *pdev);
void pci_bridge_reconfigure_ltr(struct pci_dev *pdev);
#else
+static inline void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap) { }
static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { }
static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { }
static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { }
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 79b965158473..cedea47a3547 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -814,7 +814,6 @@ static void pcie_aspm_override_default_link_state(struct pcie_link_state *link)
static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
{
struct pci_dev *child = link->downstream, *parent = link->pdev;
- u32 parent_lnkcap, child_lnkcap;
u16 parent_lnkctl, child_lnkctl;
struct pci_bus *linkbus = parent->subordinate;
@@ -829,9 +828,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
* If ASPM not supported, don't mess with the clocks and link,
* bail out now.
*/
- pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap);
- pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap);
- if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS))
+ if (!(parent->aspm_l0s_support && child->aspm_l0s_support) &&
+ !(parent->aspm_l1_support && child->aspm_l1_support))
return;
/* Configure common clock before checking latencies */
@@ -843,8 +841,6 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
* read-only Link Capabilities may change depending on common clock
* configuration (PCIe r5.0, sec 7.5.3.6).
*/
- pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap);
- pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap);
pcie_capability_read_word(parent, PCI_EXP_LNKCTL, &parent_lnkctl);
pcie_capability_read_word(child, PCI_EXP_LNKCTL, &child_lnkctl);
@@ -864,7 +860,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
* given link unless components on both sides of the link each
* support L0s.
*/
- if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S)
+ if (parent->aspm_l0s_support && child->aspm_l0s_support)
link->aspm_support |= PCIE_LINK_STATE_L0S;
if (child_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S)
@@ -873,7 +869,7 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
link->aspm_enabled |= PCIE_LINK_STATE_L0S_DW;
/* Setup L1 state */
- if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1)
+ if (parent->aspm_l1_support && child->aspm_l1_support)
link->aspm_support |= PCIE_LINK_STATE_L1;
if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1)
@@ -1530,6 +1526,19 @@ int pci_enable_link_state_locked(struct pci_dev *pdev, int state)
}
EXPORT_SYMBOL(pci_enable_link_state_locked);
+void pcie_aspm_remove_cap(struct pci_dev *pdev, u32 lnkcap)
+{
+ if (lnkcap & PCI_EXP_LNKCAP_ASPM_L0S)
+ pdev->aspm_l0s_support = 0;
+ if (lnkcap & PCI_EXP_LNKCAP_ASPM_L1)
+ pdev->aspm_l1_support = 0;
+
+ pci_info(pdev, "ASPM: Link Capabilities%s%s treated as unsupported to avoid device defect\n",
+ lnkcap & PCI_EXP_LNKCAP_ASPM_L0S ? " L0s" : "",
+ lnkcap & PCI_EXP_LNKCAP_ASPM_L1 ? " L1" : "");
+
+}
+
static int pcie_aspm_set_policy(const char *val,
const struct kernel_param *kp)
{
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0ce98e18b5a8..9cd032dff31e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1656,6 +1656,13 @@ void set_pcie_port_type(struct pci_dev *pdev)
if (reg32 & PCI_EXP_LNKCAP_DLLLARC)
pdev->link_active_reporting = 1;
+#ifdef CONFIG_PCIEASPM
+ if (reg32 & PCI_EXP_LNKCAP_ASPM_L0S)
+ pdev->aspm_l0s_support = 1;
+ if (reg32 & PCI_EXP_LNKCAP_ASPM_L1)
+ pdev->aspm_l1_support = 1;
+#endif
+
parent = pci_upstream_bridge(pdev);
if (!parent)
return;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 214ed060ca1b..b9c252aa6fe0 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -2494,28 +2494,27 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
*/
static void quirk_disable_aspm_l0s(struct pci_dev *dev)
{
- pci_info(dev, "Disabling L0s\n");
- pci_disable_link_state(dev, PCIE_LINK_STATE_L0S);
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s);
+ pcie_aspm_remove_cap(dev, PCI_EXP_LNKCAP_ASPM_L0S);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a7, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10a9, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10b6, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c6, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c7, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10c8, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10d6, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10db, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10dd, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10e1, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10ec, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f1, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x10f4, quirk_disable_aspm_l0s);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1508, quirk_disable_aspm_l0s);
static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev)
{
- pci_info(dev, "Disabling ASPM L0s/L1\n");
- pci_disable_link_state(dev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);
+ pcie_aspm_remove_cap(dev,
+ PCI_EXP_LNKCAP_ASPM_L0S | PCI_EXP_LNKCAP_ASPM_L1);
}
/*
@@ -2523,7 +2522,10 @@ static void quirk_disable_aspm_l0s_l1(struct pci_dev *dev)
* upstream PCIe root port when ASPM is enabled. At least L0s mode is affected;
* disable both L0s and L1 for now to be safe.
*/
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ASMEDIA, 0x1080, quirk_disable_aspm_l0s_l1);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, 0x0451, quirk_disable_aspm_l0s_l1);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PASEMI, 0xa002, quirk_disable_aspm_l0s_l1);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HUAWEI, 0x1105, quirk_disable_aspm_l0s_l1);
/*
* Some Pericom PCIe-to-PCI bridges in reverse mode need the PCIe Retrain
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index e255c1b069ec..7dd282da67ce 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -1109,7 +1109,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
/* compute hardware counter index */
hidx = info->csr - CSR_CYCLE;
- /* check if the corresponding bit is set in sscountovf or overflow mask in shmem */
+ /* check if the corresponding bit is set in scountovf or overflow mask in shmem */
if (!(overflow & BIT(hidx)))
continue;
diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
index 68abb6d6cecd..a8f82104a384 100644
--- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
+++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
@@ -532,6 +532,11 @@ static int cs42l43_gpio_add_pin_ranges(struct gpio_chip *chip)
return ret;
}
+static void cs42l43_fwnode_put(void *data)
+{
+ fwnode_handle_put(data);
+}
+
static int cs42l43_pin_probe(struct platform_device *pdev)
{
struct cs42l43 *cs42l43 = dev_get_drvdata(pdev->dev.parent);
@@ -563,10 +568,20 @@ static int cs42l43_pin_probe(struct platform_device *pdev)
priv->gpio_chip.ngpio = CS42L43_NUM_GPIOS;
if (is_of_node(fwnode)) {
- fwnode = fwnode_get_named_child_node(fwnode, "pinctrl");
-
- if (fwnode && !fwnode->dev)
- fwnode->dev = priv->dev;
+ struct fwnode_handle *child;
+
+ child = fwnode_get_named_child_node(fwnode, "pinctrl");
+ if (child) {
+ ret = devm_add_action_or_reset(&pdev->dev,
+ cs42l43_fwnode_put, child);
+ if (ret) {
+ fwnode_handle_put(child);
+ return ret;
+ }
+ if (!child->dev)
+ child->dev = priv->dev;
+ fwnode = child;
+ }
}
priv->gpio_chip.fwnode = fwnode;
diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8189.c b/drivers/pinctrl/mediatek/pinctrl-mt8189.c
index 7028aff55ae5..f6a3e584588b 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mt8189.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mt8189.c
@@ -1642,9 +1642,7 @@ static const struct mtk_pin_reg_calc mt8189_reg_cals[PINCTRL_PIN_REG_MAX] = {
};
static const char * const mt8189_pinctrl_register_base_names[] = {
- "gpio_base", "iocfg_bm0_base", "iocfg_bm1_base", "iocfg_bm2_base", "iocfg_lm_base",
- "iocfg_lt0_base", "iocfg_lt1_base", "iocfg_rb0_base", "iocfg_rb1_base",
- "iocfg_rt_base"
+ "base", "lm", "rb0", "rb1", "bm0", "bm1", "bm2", "lt0", "lt1", "rt",
};
static const struct mtk_eint_hw mt8189_eint_hw = {
diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8196.c b/drivers/pinctrl/mediatek/pinctrl-mt8196.c
index 82a73929c7a0..dec957c1724b 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mt8196.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mt8196.c
@@ -1801,10 +1801,8 @@ static const struct mtk_pin_reg_calc mt8196_reg_cals[PINCTRL_PIN_REG_MAX] = {
};
static const char * const mt8196_pinctrl_register_base_names[] = {
- "iocfg0", "iocfg_rt", "iocfg_rm1", "iocfg_rm2",
- "iocfg_rb", "iocfg_bm1", "iocfg_bm2", "iocfg_bm3",
- "iocfg_lt", "iocfg_lm1", "iocfg_lm2", "iocfg_lb1",
- "iocfg_lb2", "iocfg_tm1", "iocfg_tm2", "iocfg_tm3",
+ "base", "rt", "rm1", "rm2", "rb", "bm1", "bm2", "bm3",
+ "lt", "lm1", "lm2", "lb1", "lb2", "tm1", "tm2", "tm3",
};
static const struct mtk_eint_hw mt8196_eint_hw = {
diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c
index 501eb296c760..35511f83d056 100644
--- a/drivers/pinctrl/nxp/pinctrl-s32cc.c
+++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c
@@ -392,6 +392,7 @@ static int s32_pmx_gpio_request_enable(struct pinctrl_dev *pctldev,
gpio_pin->pin_id = offset;
gpio_pin->config = config;
+ INIT_LIST_HEAD(&gpio_pin->list);
spin_lock_irqsave(&ipctl->gpio_configs_lock, flags);
list_add(&gpio_pin->list, &ipctl->gpio_configs);
@@ -951,7 +952,7 @@ int s32_pinctrl_probe(struct platform_device *pdev,
spin_lock_init(&ipctl->gpio_configs_lock);
s32_pinctrl_desc =
- devm_kmalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL);
+ devm_kzalloc(&pdev->dev, sizeof(*s32_pinctrl_desc), GFP_KERNEL);
if (!s32_pinctrl_desc)
return -ENOMEM;
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
index 67525d542c5b..e99871b90ab9 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -189,7 +189,7 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
*/
if (d && i != gpio_func &&
!test_and_set_bit(d->hwirq, pctrl->disabled_for_mux))
- disable_irq(irq);
+ disable_irq_nosync(irq);
raw_spin_lock_irqsave(&pctrl->lock, flags);
diff --git a/drivers/pinctrl/realtek/Kconfig b/drivers/pinctrl/realtek/Kconfig
index 0fc6bd4fcb7e..400c9e5b16ad 100644
--- a/drivers/pinctrl/realtek/Kconfig
+++ b/drivers/pinctrl/realtek/Kconfig
@@ -6,6 +6,7 @@ config PINCTRL_RTD
default y
select PINMUX
select GENERIC_PINCONF
+ select REGMAP_MMIO
config PINCTRL_RTD1619B
tristate "Realtek DHC 1619B pin controller driver"
diff --git a/drivers/platform/arm64/lenovo-thinkpad-t14s.c b/drivers/platform/arm64/lenovo-thinkpad-t14s.c
index 1d5d11adaf32..cf6a1d3b2617 100644
--- a/drivers/platform/arm64/lenovo-thinkpad-t14s.c
+++ b/drivers/platform/arm64/lenovo-thinkpad-t14s.c
@@ -120,6 +120,7 @@ static int t14s_ec_write(void *context, unsigned int reg,
if (ret < 0)
return ret;
+ fsleep(10000);
return 0;
}
@@ -157,6 +158,7 @@ static int t14s_ec_read(void *context, unsigned int reg,
out:
i2c_unlock_bus(client->adapter, I2C_LOCK_SEGMENT);
+ fsleep(10000);
return ret;
}
@@ -191,6 +193,8 @@ static int t14s_ec_read_evt(struct t14s_ec *ec, u8 *val)
if (ret < 0)
goto out;
+ fsleep(10000);
+
ret = 0;
out:
@@ -557,12 +561,6 @@ static int t14s_ec_probe(struct i2c_client *client)
return dev_err_probe(dev, PTR_ERR(ec->regmap),
"Failed to init regmap\n");
- ret = devm_request_threaded_irq(dev, client->irq, NULL,
- t14s_ec_irq_handler,
- IRQF_ONESHOT, dev_name(dev), ec);
- if (ret < 0)
- return dev_err_probe(dev, ret, "Failed to get IRQ\n");
-
ret = t14s_leds_probe(ec);
if (ret < 0)
return ret;
@@ -579,6 +577,12 @@ static int t14s_ec_probe(struct i2c_client *client)
if (ret < 0)
return ret;
+ ret = devm_request_threaded_irq(dev, client->irq, NULL,
+ t14s_ec_irq_handler,
+ IRQF_ONESHOT, dev_name(dev), ec);
+ if (ret < 0)
+ return dev_err_probe(dev, ret, "Failed to get IRQ\n");
+
/*
* Disable wakeup support by default, because the driver currently does
* not support masking any events and the laptop should not wake up when
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index c122016d82f1..c883a28e0916 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -545,6 +545,7 @@ config MSI_WMI
config MSI_WMI_PLATFORM
tristate "MSI WMI Platform features"
depends on ACPI_WMI
+ depends on DMI
depends on HWMON
help
Say Y here if you want to have support for WMI-based platform features
diff --git a/drivers/platform/x86/acer-wmi.c b/drivers/platform/x86/acer-wmi.c
index 13eb22b35aa8..d848afc91f87 100644
--- a/drivers/platform/x86/acer-wmi.c
+++ b/drivers/platform/x86/acer-wmi.c
@@ -102,6 +102,7 @@ MODULE_ALIAS("wmi:676AA15E-6A47-4D9F-A2CC-1E6D18D14026");
enum acer_wmi_event_ids {
WMID_HOTKEY_EVENT = 0x1,
+ WMID_BACKLIGHT_EVENT = 0x4,
WMID_ACCEL_OR_KBD_DOCK_EVENT = 0x5,
WMID_GAMING_TURBO_KEY_EVENT = 0x7,
WMID_AC_EVENT = 0x8,
@@ -2369,6 +2370,9 @@ static void acer_wmi_notify(union acpi_object *obj, void *context)
sparse_keymap_report_event(acer_wmi_input_dev, scancode, 1, true);
}
break;
+ case WMID_BACKLIGHT_EVENT:
+ /* Already handled by acpi-video */
+ break;
case WMID_ACCEL_OR_KBD_DOCK_EVENT:
acer_gsensor_event();
acer_kbd_dock_event(&return_value);
diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c
index d63aaad7ef59..404e62ad293a 100644
--- a/drivers/platform/x86/amd/pmc/pmc-quirks.c
+++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c
@@ -122,6 +122,14 @@ static const struct dmi_system_id fwbug_list[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "21A1"),
}
},
+ {
+ .ident = "ROG Xbox Ally RC73YA",
+ .driver_data = &quirk_spurious_8042,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "RC73YA"),
+ }
+ },
/* https://bugzilla.kernel.org/show_bug.cgi?id=218024 */
{
.ident = "V14 G4 AMN",
@@ -204,6 +212,23 @@ static const struct dmi_system_id fwbug_list[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "82ND"),
}
},
+ /* https://gitlab.freedesktop.org/drm/amd/-/issues/4618 */
+ {
+ .ident = "Lenovo Legion Go 2",
+ .driver_data = &quirk_s2idle_bug,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "83N0"),
+ }
+ },
+ {
+ .ident = "Lenovo Legion Go 2",
+ .driver_data = &quirk_s2idle_bug,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "83N1"),
+ }
+ },
/* https://gitlab.freedesktop.org/drm/amd/-/issues/2684 */
{
.ident = "HP Laptop 15s-eq2xxx",
diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c
index bd318fd02ccf..cae3fcafd4d7 100644
--- a/drivers/platform/x86/amd/pmc/pmc.c
+++ b/drivers/platform/x86/amd/pmc/pmc.c
@@ -106,6 +106,7 @@ static void amd_pmc_get_ip_info(struct amd_pmc_dev *dev)
switch (dev->cpu_id) {
case AMD_CPU_ID_PCO:
case AMD_CPU_ID_RN:
+ case AMD_CPU_ID_VG:
case AMD_CPU_ID_YC:
case AMD_CPU_ID_CB:
dev->num_ips = 12;
@@ -517,6 +518,7 @@ static int amd_pmc_get_os_hint(struct amd_pmc_dev *dev)
case AMD_CPU_ID_PCO:
return MSG_OS_HINT_PCO;
case AMD_CPU_ID_RN:
+ case AMD_CPU_ID_VG:
case AMD_CPU_ID_YC:
case AMD_CPU_ID_CB:
case AMD_CPU_ID_PS:
@@ -717,6 +719,7 @@ static const struct pci_device_id pmc_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_RV) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SP) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_SHP) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, AMD_CPU_ID_VG) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_ROOT) },
{ }
diff --git a/drivers/platform/x86/amd/pmc/pmc.h b/drivers/platform/x86/amd/pmc/pmc.h
index 62f3e51020fd..fe3f53eb5955 100644
--- a/drivers/platform/x86/amd/pmc/pmc.h
+++ b/drivers/platform/x86/amd/pmc/pmc.h
@@ -156,6 +156,7 @@ void amd_mp2_stb_deinit(struct amd_pmc_dev *dev);
#define AMD_CPU_ID_RN 0x1630
#define AMD_CPU_ID_PCO AMD_CPU_ID_RV
#define AMD_CPU_ID_CZN AMD_CPU_ID_RN
+#define AMD_CPU_ID_VG 0x1645
#define AMD_CPU_ID_YC 0x14B5
#define AMD_CPU_ID_CB 0x14D8
#define AMD_CPU_ID_PS 0x14E8
diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c
index f417dcc9af35..fadf7aac6779 100644
--- a/drivers/platform/x86/dell/alienware-wmi-wmax.c
+++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c
@@ -90,34 +90,34 @@ static struct awcc_quirks empty_quirks;
static const struct dmi_system_id awcc_dmi_table[] __initconst = {
{
- .ident = "Alienware Area-51m",
+ .ident = "Alienware 16 Aurora",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware 16 Aurora"),
},
- .driver_data = &generic_quirks,
+ .driver_data = &g_series_quirks,
},
{
- .ident = "Alienware Area-51m R2",
+ .ident = "Alienware Area-51m",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m R2"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware Area-51m"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware m15 R5",
+ .ident = "Alienware m15",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R5"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware m15 R7",
+ .ident = "Alienware m16 R1 AMD",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R7"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"),
},
.driver_data = &generic_quirks,
},
@@ -130,14 +130,6 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = {
.driver_data = &g_series_quirks,
},
{
- .ident = "Alienware m16 R1 AMD",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m16 R1 AMD"),
- },
- .driver_data = &generic_quirks,
- },
- {
.ident = "Alienware m16 R2",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
@@ -146,114 +138,66 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = {
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware m17 R5",
+ .ident = "Alienware m17",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17 R5 AMD"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m17"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware m18 R2",
+ .ident = "Alienware m18",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18 R2"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m18"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware x15 R1",
+ .ident = "Alienware x15",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R1"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware x15 R2",
+ .ident = "Alienware x17",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x15 R2"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17"),
},
.driver_data = &generic_quirks,
},
{
- .ident = "Alienware x17 R2",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Alienware"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Alienware x17 R2"),
- },
- .driver_data = &generic_quirks,
- },
- {
- .ident = "Dell Inc. G15 5510",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5510"),
- },
- .driver_data = &g_series_quirks,
- },
- {
- .ident = "Dell Inc. G15 5511",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5511"),
- },
- .driver_data = &g_series_quirks,
- },
- {
- .ident = "Dell Inc. G15 5515",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5515"),
- },
- .driver_data = &g_series_quirks,
- },
- {
- .ident = "Dell Inc. G15 5530",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15 5530"),
- },
- .driver_data = &g_series_quirks,
- },
- {
- .ident = "Dell Inc. G16 7630",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16 7630"),
- },
- .driver_data = &g_series_quirks,
- },
- {
- .ident = "Dell Inc. G3 3500",
+ .ident = "Dell Inc. G15",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "G3 3500"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell G15"),
},
.driver_data = &g_series_quirks,
},
{
- .ident = "Dell Inc. G3 3590",
+ .ident = "Dell Inc. G16",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "G3 3590"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell G16"),
},
.driver_data = &g_series_quirks,
},
{
- .ident = "Dell Inc. G5 5500",
+ .ident = "Dell Inc. G3",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "G5 5500"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "G3"),
},
.driver_data = &g_series_quirks,
},
{
- .ident = "Dell Inc. G5 5505",
+ .ident = "Dell Inc. G5",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "G5 5505"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "G5"),
},
.driver_data = &g_series_quirks,
},
diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c
index 8b3533d6ba09..ad9d9f97960f 100644
--- a/drivers/platform/x86/hp/hp-wmi.c
+++ b/drivers/platform/x86/hp/hp-wmi.c
@@ -92,9 +92,11 @@ static const char * const victus_thermal_profile_boards[] = {
"8A25"
};
-/* DMI Board names of Victus 16-r1000 and Victus 16-s1000 laptops */
+/* DMI Board names of Victus 16-r and Victus 16-s laptops */
static const char * const victus_s_thermal_profile_boards[] = {
- "8C99", "8C9C"
+ "8BBE", "8BD4", "8BD5",
+ "8C78", "8C99", "8C9C",
+ "8D41",
};
enum hp_wmi_radio {
diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c
index c3772df34679..8a4c54089ace 100644
--- a/drivers/platform/x86/huawei-wmi.c
+++ b/drivers/platform/x86/huawei-wmi.c
@@ -81,6 +81,10 @@ static const struct key_entry huawei_wmi_keymap[] = {
{ KE_KEY, 0x289, { KEY_WLAN } },
// Huawei |M| key
{ KE_KEY, 0x28a, { KEY_CONFIG } },
+ // HONOR YOYO key
+ { KE_KEY, 0x28b, { KEY_NOTIFICATION_CENTER } },
+ // HONOR print screen
+ { KE_KEY, 0x28e, { KEY_PRINT } },
// Keyboard backlit
{ KE_IGNORE, 0x293, { KEY_KBDILLUMTOGGLE } },
{ KE_IGNORE, 0x294, { KEY_KBDILLUMUP } },
diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c
index f25a427cccda..9c07a7faf18f 100644
--- a/drivers/platform/x86/intel/hid.c
+++ b/drivers/platform/x86/intel/hid.c
@@ -55,6 +55,7 @@ static const struct acpi_device_id intel_hid_ids[] = {
{ "INTC10CB" },
{ "INTC10CC" },
{ "INTC10F1" },
+ { "INTC10F2" },
{ }
};
MODULE_DEVICE_TABLE(acpi, intel_hid_ids);
diff --git a/drivers/platform/x86/intel/punit_ipc.c b/drivers/platform/x86/intel/punit_ipc.c
index bafac8aa2baf..14513010daad 100644
--- a/drivers/platform/x86/intel/punit_ipc.c
+++ b/drivers/platform/x86/intel/punit_ipc.c
@@ -250,7 +250,7 @@ static int intel_punit_ipc_probe(struct platform_device *pdev)
} else {
ret = devm_request_irq(&pdev->dev, irq, intel_punit_ioc,
IRQF_NO_SUSPEND, "intel_punit_ipc",
- &punit_ipcdev);
+ punit_ipcdev);
if (ret) {
dev_err(&pdev->dev, "Failed to request irq: %d\n", irq);
return ret;
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c
index 3f4343147dad..950ede5eab76 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_if_mmio.c
@@ -108,11 +108,11 @@ static int isst_if_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
ret = pci_read_config_dword(pdev, 0xD0, &mmio_base);
if (ret)
- return ret;
+ return pcibios_err_to_errno(ret);
ret = pci_read_config_dword(pdev, 0xFC, &pcu_base);
if (ret)
- return ret;
+ return pcibios_err_to_errno(ret);
pcu_base &= GENMASK(10, 0);
base_addr = (u64)mmio_base << 23 | (u64) pcu_base << 12;
diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h
index 70ae11519837..0abe850ef54e 100644
--- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h
+++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-common.h
@@ -40,7 +40,7 @@
* @agent_type_mask: Bit mask of all hardware agents for this domain
* @uncore_attr_group: Attribute group storage
* @max_freq_khz_kobj_attr: Storage for kobject attribute max_freq_khz
- * @mix_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz
+ * @min_freq_khz_kobj_attr: Storage for kobject attribute min_freq_khz
* @initial_max_freq_khz_kobj_attr: Storage for kobject attribute initial_max_freq_khz
* @initial_min_freq_khz_kobj_attr: Storage for kobject attribute initial_min_freq_khz
* @current_freq_khz_kobj_attr: Storage for kobject attribute current_freq_khz
@@ -48,13 +48,14 @@
* @fabric_cluster_id_kobj_attr: Storage for kobject attribute fabric_cluster_id
* @package_id_kobj_attr: Storage for kobject attribute package_id
* @elc_low_threshold_percent_kobj_attr:
- Storage for kobject attribute elc_low_threshold_percent
+ * Storage for kobject attribute elc_low_threshold_percent
* @elc_high_threshold_percent_kobj_attr:
- Storage for kobject attribute elc_high_threshold_percent
+ * Storage for kobject attribute elc_high_threshold_percent
* @elc_high_threshold_enable_kobj_attr:
- Storage for kobject attribute elc_high_threshold_enable
+ * Storage for kobject attribute elc_high_threshold_enable
* @elc_floor_freq_khz_kobj_attr: Storage for kobject attribute elc_floor_freq_khz
* @agent_types_kobj_attr: Storage for kobject attribute agent_type
+ * @die_id_kobj_attr: Attribute storage for die_id information
* @uncore_attrs: Attribute storage for group creation
*
* This structure is used to encapsulate all data related to uncore sysfs
diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c
index 2a6897035150..0dfc552b2802 100644
--- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c
+++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c
@@ -256,6 +256,10 @@ static const struct x86_cpu_id intel_uncore_cpu_ids[] = {
X86_MATCH_VFM(INTEL_ARROWLAKE, NULL),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, NULL),
+ X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL),
+ X86_MATCH_VFM(INTEL_NOVALAKE, NULL),
+ X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_cpu_ids);
diff --git a/drivers/platform/x86/msi-wmi-platform.c b/drivers/platform/x86/msi-wmi-platform.c
index dc5e9878cb68..e912fcc12d12 100644
--- a/drivers/platform/x86/msi-wmi-platform.c
+++ b/drivers/platform/x86/msi-wmi-platform.c
@@ -14,6 +14,7 @@
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/device/driver.h>
+#include <linux/dmi.h>
#include <linux/errno.h>
#include <linux/hwmon.h>
#include <linux/kernel.h>
@@ -28,7 +29,7 @@
#define DRIVER_NAME "msi-wmi-platform"
-#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11d1-00A0-C90629100000"
+#define MSI_PLATFORM_GUID "ABBC0F6E-8EA1-11D1-00A0-C90629100000"
#define MSI_WMI_PLATFORM_INTERFACE_VERSION 2
@@ -448,7 +449,45 @@ static struct wmi_driver msi_wmi_platform_driver = {
.probe = msi_wmi_platform_probe,
.no_singleton = true,
};
-module_wmi_driver(msi_wmi_platform_driver);
+
+/*
+ * MSI reused the WMI GUID from the WMI-ACPI sample code provided by Microsoft,
+ * so other manufacturers might use it as well for their WMI-ACPI implementations.
+ */
+static const struct dmi_system_id msi_wmi_platform_whitelist[] __initconst = {
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "MICRO-STAR INT"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Micro-Star International"),
+ },
+ },
+ { }
+};
+
+static int __init msi_wmi_platform_module_init(void)
+{
+ if (!dmi_check_system(msi_wmi_platform_whitelist)) {
+ if (!force)
+ return -ENODEV;
+
+ pr_warn("Ignoring DMI whitelist\n");
+ }
+
+ return wmi_driver_register(&msi_wmi_platform_driver);
+}
+
+static void __exit msi_wmi_platform_module_exit(void)
+{
+ wmi_driver_unregister(&msi_wmi_platform_driver);
+}
+
+module_init(msi_wmi_platform_module_init);
+module_exit(msi_wmi_platform_module_exit);
+
MODULE_AUTHOR("Armin Wolf <W_Armin@gmx.de>");
MODULE_DESCRIPTION("MSI WMI platform features");
diff --git a/drivers/pmdomain/arm/scmi_pm_domain.c b/drivers/pmdomain/arm/scmi_pm_domain.c
index 8fe1c0a501c9..b5e2ffd5ea64 100644
--- a/drivers/pmdomain/arm/scmi_pm_domain.c
+++ b/drivers/pmdomain/arm/scmi_pm_domain.c
@@ -41,7 +41,7 @@ static int scmi_pd_power_off(struct generic_pm_domain *domain)
static int scmi_pm_domain_probe(struct scmi_device *sdev)
{
- int num_domains, i;
+ int num_domains, i, ret;
struct device *dev = &sdev->dev;
struct device_node *np = dev->of_node;
struct scmi_pm_domain *scmi_pd;
@@ -108,9 +108,18 @@ static int scmi_pm_domain_probe(struct scmi_device *sdev)
scmi_pd_data->domains = domains;
scmi_pd_data->num_domains = num_domains;
+ ret = of_genpd_add_provider_onecell(np, scmi_pd_data);
+ if (ret)
+ goto err_rm_genpds;
+
dev_set_drvdata(dev, scmi_pd_data);
- return of_genpd_add_provider_onecell(np, scmi_pd_data);
+ return 0;
+err_rm_genpds:
+ for (i = num_domains - 1; i >= 0; i--)
+ pm_genpd_remove(domains[i]);
+
+ return ret;
}
static void scmi_pm_domain_remove(struct scmi_device *sdev)
diff --git a/drivers/pmdomain/imx/gpc.c b/drivers/pmdomain/imx/gpc.c
index 33991f3c6b55..a34b260274f7 100644
--- a/drivers/pmdomain/imx/gpc.c
+++ b/drivers/pmdomain/imx/gpc.c
@@ -536,6 +536,8 @@ static void imx_gpc_remove(struct platform_device *pdev)
return;
}
}
+
+ of_node_put(pgc_node);
}
static struct platform_driver imx_gpc_driver = {
diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c
index 0ebe7379b94e..9c9323c8c93a 100644
--- a/drivers/pmdomain/mediatek/mtk-pm-domains.c
+++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c
@@ -748,6 +748,18 @@ static void scpsys_domain_cleanup(struct scpsys *scpsys)
}
}
+static struct device_node *scpsys_get_legacy_regmap(struct device_node *np, const char *pn)
+{
+ struct device_node *local_node;
+
+ for_each_child_of_node(np, local_node) {
+ if (of_property_present(local_node, pn))
+ return local_node;
+ }
+
+ return NULL;
+}
+
static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *scpsys)
{
const u8 bp_blocks[3] = {
@@ -769,7 +781,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s
* this makes it then possible to allocate the array of bus_prot
* regmaps and convert all to the new style handling.
*/
- node = of_find_node_with_property(np, "mediatek,infracfg");
+ node = scpsys_get_legacy_regmap(np, "mediatek,infracfg");
if (node) {
regmap[0] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg");
of_node_put(node);
@@ -782,7 +794,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s
regmap[0] = NULL;
}
- node = of_find_node_with_property(np, "mediatek,smi");
+ node = scpsys_get_legacy_regmap(np, "mediatek,smi");
if (node) {
smi_np = of_parse_phandle(node, "mediatek,smi", 0);
of_node_put(node);
@@ -800,7 +812,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s
regmap[1] = NULL;
}
- node = of_find_node_with_property(np, "mediatek,infracfg-nao");
+ node = scpsys_get_legacy_regmap(np, "mediatek,infracfg-nao");
if (node) {
regmap[2] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao");
num_regmaps++;
diff --git a/drivers/pmdomain/samsung/exynos-pm-domains.c b/drivers/pmdomain/samsung/exynos-pm-domains.c
index 5d478bb37ad6..5c3aa8983087 100644
--- a/drivers/pmdomain/samsung/exynos-pm-domains.c
+++ b/drivers/pmdomain/samsung/exynos-pm-domains.c
@@ -92,13 +92,14 @@ static const struct of_device_id exynos_pm_domain_of_match[] = {
{ },
};
-static const char *exynos_get_domain_name(struct device_node *node)
+static const char *exynos_get_domain_name(struct device *dev,
+ struct device_node *node)
{
const char *name;
if (of_property_read_string(node, "label", &name) < 0)
name = kbasename(node->full_name);
- return kstrdup_const(name, GFP_KERNEL);
+ return devm_kstrdup_const(dev, name, GFP_KERNEL);
}
static int exynos_pd_probe(struct platform_device *pdev)
@@ -115,20 +116,27 @@ static int exynos_pd_probe(struct platform_device *pdev)
if (!pd)
return -ENOMEM;
- pd->pd.name = exynos_get_domain_name(np);
+ pd->pd.name = exynos_get_domain_name(dev, np);
if (!pd->pd.name)
return -ENOMEM;
pd->base = of_iomap(np, 0);
- if (!pd->base) {
- kfree_const(pd->pd.name);
+ if (!pd->base)
return -ENODEV;
- }
pd->pd.power_off = exynos_pd_power_off;
pd->pd.power_on = exynos_pd_power_on;
pd->local_pwr_cfg = pm_domain_cfg->local_pwr_cfg;
+ /*
+ * Some Samsung platforms, whose bootloaders turn on the splash-screen
+ * and hand it over to the kernel, require the power-domains to be
+ * reset during boot.
+ */
+ if (IS_ENABLED(CONFIG_ARM) &&
+ of_device_is_compatible(np, "samsung,exynos4210-pd"))
+ exynos_pd_power_off(&pd->pd);
+
on = readl_relaxed(pd->base + 0x4) & pd->local_pwr_cfg;
pm_genpd_init(&pd->pd, NULL, !on);
@@ -147,15 +155,6 @@ static int exynos_pd_probe(struct platform_device *pdev)
parent.np, child.np);
}
- /*
- * Some Samsung platforms with bootloaders turning on the splash-screen
- * and handing it over to the kernel, requires the power-domains to be
- * reset during boot. As a temporary hack to manage this, let's enforce
- * a sync_state.
- */
- if (!ret)
- of_genpd_sync_state(np);
-
pm_runtime_enable(dev);
return ret;
}
diff --git a/drivers/pmdomain/tegra/powergate-bpmp.c b/drivers/pmdomain/tegra/powergate-bpmp.c
index b0138ca9f851..9f4366250bfd 100644
--- a/drivers/pmdomain/tegra/powergate-bpmp.c
+++ b/drivers/pmdomain/tegra/powergate-bpmp.c
@@ -184,6 +184,7 @@ tegra_powergate_add(struct tegra_bpmp *bpmp,
powergate->genpd.name = kstrdup(info->name, GFP_KERNEL);
powergate->genpd.power_on = tegra_powergate_power_on;
powergate->genpd.power_off = tegra_powergate_power_off;
+ powergate->genpd.flags = GENPD_FLAG_NO_STAY_ON;
err = pm_genpd_init(&powergate->genpd, NULL, off);
if (err < 0) {
diff --git a/drivers/power/supply/intel_dc_ti_battery.c b/drivers/power/supply/intel_dc_ti_battery.c
index 56b0c92e9d28..67a75281b0ac 100644
--- a/drivers/power/supply/intel_dc_ti_battery.c
+++ b/drivers/power/supply/intel_dc_ti_battery.c
@@ -127,7 +127,8 @@ struct dc_ti_battery_chip {
static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, int *volt, int *curr)
{
struct dc_ti_battery_chip *chip = power_supply_get_drvdata(psy);
- s64 cnt_start_usec, now_usec, sleep_usec;
+ ktime_t ktime;
+ s64 sleep_usec;
unsigned int reg_val;
s32 acc, smpl_ctr;
int ret;
@@ -141,16 +142,17 @@ static int dc_ti_battery_get_voltage_and_current_now(struct power_supply *psy, i
if (ret)
goto out_err;
- cnt_start_usec = ktime_get_ns() / NSEC_PER_USEC;
+ ktime = ktime_get();
/* Read Vbat, convert IIO mV to power-supply ųV */
ret = iio_read_channel_processed_scale(chip->vbat_channel, volt, 1000);
if (ret < 0)
goto out_err;
+ ktime = ktime_sub(ktime_get(), ktime);
+
/* Sleep at least 3 sample-times + slack to get 3+ CC samples */
- now_usec = ktime_get_ns() / NSEC_PER_USEC;
- sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - (now_usec - cnt_start_usec);
+ sleep_usec = 3 * SMPL_INTVL_US + SLEEP_SLACK_US - ktime_to_us(ktime);
if (sleep_usec > 0 && sleep_usec < 1000000)
usleep_range(sleep_usec, sleep_usec + SLEEP_SLACK_US);
diff --git a/drivers/pwm/pwm-adp5585.c b/drivers/pwm/pwm-adp5585.c
index dc2860979e24..806f8d79b0d7 100644
--- a/drivers/pwm/pwm-adp5585.c
+++ b/drivers/pwm/pwm-adp5585.c
@@ -190,13 +190,13 @@ static int adp5585_pwm_probe(struct platform_device *pdev)
return 0;
}
-static const struct adp5585_pwm_chip adp5589_pwm_chip_info = {
+static const struct adp5585_pwm_chip adp5585_pwm_chip_info = {
.pwm_cfg = ADP5585_PWM_CFG,
.pwm_offt_low = ADP5585_PWM_OFFT_LOW,
.pwm_ont_low = ADP5585_PWM_ONT_LOW,
};
-static const struct adp5585_pwm_chip adp5585_pwm_chip_info = {
+static const struct adp5585_pwm_chip adp5589_pwm_chip_info = {
.pwm_cfg = ADP5589_PWM_CFG,
.pwm_offt_low = ADP5589_PWM_OFFT_LOW,
.pwm_ont_low = ADP5589_PWM_ONT_LOW,
diff --git a/drivers/regulator/fixed.c b/drivers/regulator/fixed.c
index 1cb647ed70c6..a2d16e9abfb5 100644
--- a/drivers/regulator/fixed.c
+++ b/drivers/regulator/fixed.c
@@ -334,6 +334,7 @@ static int reg_fixed_voltage_probe(struct platform_device *pdev)
ret = dev_err_probe(&pdev->dev, PTR_ERR(drvdata->dev),
"Failed to register regulator: %ld\n",
PTR_ERR(drvdata->dev));
+ gpiod_put(cfg.ena_gpiod);
return ret;
}
diff --git a/drivers/regulator/rtq2208-regulator.c b/drivers/regulator/rtq2208-regulator.c
index 9cde7181b0f0..f669a562f036 100644
--- a/drivers/regulator/rtq2208-regulator.c
+++ b/drivers/regulator/rtq2208-regulator.c
@@ -53,7 +53,7 @@
#define RTQ2208_MASK_BUCKPH_GROUP1 GENMASK(6, 4)
#define RTQ2208_MASK_BUCKPH_GROUP2 GENMASK(2, 0)
#define RTQ2208_MASK_LDO2_OPT0 BIT(7)
-#define RTQ2208_MASK_LDO2_OPT1 BIT(6)
+#define RTQ2208_MASK_LDO2_OPT1 BIT(7)
#define RTQ2208_MASK_LDO1_FIXED BIT(6)
/* Size */
@@ -543,14 +543,14 @@ static int rtq2208_regulator_check(struct device *dev, int *num, int *regulator_
switch (FIELD_GET(RTQ2208_MASK_BUCKPH_GROUP2, buck_phase)) {
case 2:
- rtq2208_used_table[RTQ2208_BUCK_F] = true;
+ rtq2208_used_table[RTQ2208_BUCK_H] = true;
fallthrough;
case 1:
rtq2208_used_table[RTQ2208_BUCK_E] = true;
fallthrough;
case 0:
case 3:
- rtq2208_used_table[RTQ2208_BUCK_H] = true;
+ rtq2208_used_table[RTQ2208_BUCK_F] = true;
fallthrough;
default:
rtq2208_used_table[RTQ2208_BUCK_G] = true;
diff --git a/drivers/reset/reset-imx8mp-audiomix.c b/drivers/reset/reset-imx8mp-audiomix.c
index 6b357adfe646..eceb37ff5dc5 100644
--- a/drivers/reset/reset-imx8mp-audiomix.c
+++ b/drivers/reset/reset-imx8mp-audiomix.c
@@ -14,8 +14,8 @@
#include <linux/reset-controller.h>
#define IMX8MP_AUDIOMIX_EARC_RESET_OFFSET 0x200
-#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(1)
-#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(2)
+#define IMX8MP_AUDIOMIX_EARC_RESET_MASK BIT(0)
+#define IMX8MP_AUDIOMIX_EARC_PHY_RESET_MASK BIT(1)
#define IMX8MP_AUDIOMIX_DSP_RUNSTALL_OFFSET 0x108
#define IMX8MP_AUDIOMIX_DSP_RUNSTALL_MASK BIT(5)
diff --git a/drivers/s390/net/ctcm_mpc.c b/drivers/s390/net/ctcm_mpc.c
index 0aeafa772fb1..407b7c516658 100644
--- a/drivers/s390/net/ctcm_mpc.c
+++ b/drivers/s390/net/ctcm_mpc.c
@@ -701,7 +701,6 @@ static void mpc_rcvd_sweep_req(struct mpcg_info *mpcginfo)
grp->sweep_req_pend_num--;
ctcmpc_send_sweep_resp(ch);
- kfree(mpcginfo);
return;
}
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 4c62c597c7be..b3af9b78fa12 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -2208,9 +2208,17 @@ sg_remove_sfp_usercontext(struct work_struct *work)
write_lock_irqsave(&sfp->rq_list_lock, iflags);
while (!list_empty(&sfp->rq_list)) {
srp = list_first_entry(&sfp->rq_list, Sg_request, entry);
- sg_finish_rem_req(srp);
list_del(&srp->entry);
+ write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
+
+ sg_finish_rem_req(srp);
+ /*
+ * sg_rq_end_io() uses srp->parentfp. Hence, only clear
+ * srp->parentfp after blk_mq_free_request() has been called.
+ */
srp->parentfp = NULL;
+
+ write_lock_irqsave(&sfp->rq_list_lock, iflags);
}
write_unlock_irqrestore(&sfp->rq_list_lock, iflags);
diff --git a/drivers/slimbus/qcom-ngd-ctrl.c b/drivers/slimbus/qcom-ngd-ctrl.c
index 4fb66986cc22..cd40ab839c54 100644
--- a/drivers/slimbus/qcom-ngd-ctrl.c
+++ b/drivers/slimbus/qcom-ngd-ctrl.c
@@ -1241,6 +1241,7 @@ static void qcom_slim_ngd_notify_slaves(struct qcom_slim_ngd_ctrl *ctrl)
if (slim_get_logical_addr(sbdev))
dev_err(ctrl->dev, "Failed to get logical address\n");
+ put_device(&sbdev->dev);
}
}
diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig
index 4d8f00c850c1..55675750182e 100644
--- a/drivers/spi/Kconfig
+++ b/drivers/spi/Kconfig
@@ -1181,10 +1181,10 @@ config SPI_TEGRA210_QUAD
config SPI_TEGRA114
tristate "NVIDIA Tegra114 SPI Controller"
- depends on (ARCH_TEGRA && TEGRA20_APB_DMA) || COMPILE_TEST
+ depends on ARCH_TEGRA || COMPILE_TEST
depends on RESET_CONTROLLER
help
- SPI driver for NVIDIA Tegra114 SPI Controller interface. This controller
+ SPI controller driver for NVIDIA Tegra114 and later SoCs. This controller
is different than the older SoCs SPI controller and also register interface
get changed with this controller.
diff --git a/drivers/spi/spi-amlogic-spifc-a1.c b/drivers/spi/spi-amlogic-spifc-a1.c
index 18c9aa2cbc29..eb503790017b 100644
--- a/drivers/spi/spi-amlogic-spifc-a1.c
+++ b/drivers/spi/spi-amlogic-spifc-a1.c
@@ -353,7 +353,9 @@ static int amlogic_spifc_a1_probe(struct platform_device *pdev)
pm_runtime_set_autosuspend_delay(spifc->dev, 500);
pm_runtime_use_autosuspend(spifc->dev);
- devm_pm_runtime_enable(spifc->dev);
+ ret = devm_pm_runtime_enable(spifc->dev);
+ if (ret)
+ return ret;
ctrl->num_chipselect = 1;
ctrl->dev.of_node = pdev->dev.of_node;
diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c
index b56210734caa..2e3c62f12bef 100644
--- a/drivers/spi/spi-bcm63xx.c
+++ b/drivers/spi/spi-bcm63xx.c
@@ -247,6 +247,20 @@ static int bcm63xx_txrx_bufs(struct spi_device *spi, struct spi_transfer *first,
if (t->rx_buf) {
do_rx = true;
+
+ /*
+ * In certain hardware implementations, there appears to be a
+ * hidden accumulator that tracks the number of bytes written into
+ * the hardware FIFO, and this accumulator overrides the length in
+ * the SPI_MSG_CTL register.
+ *
+ * Therefore, for read-only transfers, we need to write some dummy
+ * value into the FIFO to keep the accumulator tracking the correct
+ * length.
+ */
+ if (!t->tx_buf)
+ memset_io(bs->tx_io + len, 0xFF, t->len);
+
/* prepend is half-duplex write only */
if (t == first)
prepend_len = 0;
diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index 81017402bc56..af6d050da1c8 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -1981,6 +1981,13 @@ static int cqspi_probe(struct platform_device *pdev)
cqspi->current_cs = -1;
cqspi->sclk = 0;
+ if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
+ pm_runtime_enable(dev);
+ pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT);
+ pm_runtime_use_autosuspend(dev);
+ pm_runtime_get_noresume(dev);
+ }
+
ret = cqspi_setup_flash(cqspi);
if (ret) {
dev_err(dev, "failed to setup flash parameters %d\n", ret);
@@ -1995,14 +2002,7 @@ static int cqspi_probe(struct platform_device *pdev)
if (cqspi->use_direct_mode) {
ret = cqspi_request_mmap_dma(cqspi);
if (ret == -EPROBE_DEFER)
- goto probe_dma_failed;
- }
-
- if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
- pm_runtime_enable(dev);
- pm_runtime_set_autosuspend_delay(dev, CQSPI_AUTOSUSPEND_TIMEOUT);
- pm_runtime_use_autosuspend(dev);
- pm_runtime_get_noresume(dev);
+ goto probe_setup_failed;
}
ret = spi_register_controller(host);
@@ -2012,7 +2012,6 @@ static int cqspi_probe(struct platform_device *pdev)
}
if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM))) {
- pm_runtime_put_autosuspend(dev);
pm_runtime_mark_last_busy(dev);
pm_runtime_put_autosuspend(dev);
}
@@ -2021,7 +2020,6 @@ static int cqspi_probe(struct platform_device *pdev)
probe_setup_failed:
if (!(ddata && (ddata->quirks & CQSPI_DISABLE_RUNTIME_PM)))
pm_runtime_disable(dev);
-probe_dma_failed:
cqspi_controller_enable(cqspi, 0);
probe_reset_failed:
if (cqspi->is_jh7110)
diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c
index 8da66e101386..065456aba2ae 100644
--- a/drivers/spi/spi-fsl-lpspi.c
+++ b/drivers/spi/spi-fsl-lpspi.c
@@ -486,7 +486,13 @@ static int fsl_lpspi_setup_transfer(struct spi_controller *controller,
fsl_lpspi->tx = fsl_lpspi_buf_tx_u32;
}
- fsl_lpspi->watermark = min_t(typeof(fsl_lpspi->watermark),
+ /*
+ * t->len is 'unsigned' while txfifosize and watermark are 'u8', so a
+ * type cast is unavoidable. With a 'u8' cast, any len > 255 is truncated
+ * in min_t() and the wrong watermark is set. Use 'unsigned int' as the
+ * type for min_t() to avoid the truncation.
+ */
+ fsl_lpspi->watermark = min_t(unsigned int,
fsl_lpspi->txfifosize,
t->len);
diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c
index 155ddeb8fcd4..bbf1fd4fe1e9 100644
--- a/drivers/spi/spi-imx.c
+++ b/drivers/spi/spi-imx.c
@@ -519,9 +519,15 @@ static void mx51_ecspi_trigger(struct spi_imx_data *spi_imx)
{
u32 reg;
- reg = readl(spi_imx->base + MX51_ECSPI_CTRL);
- reg |= MX51_ECSPI_CTRL_XCH;
- writel(reg, spi_imx->base + MX51_ECSPI_CTRL);
+ if (spi_imx->usedma) {
+ reg = readl(spi_imx->base + MX51_ECSPI_DMA);
+ reg |= MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN;
+ writel(reg, spi_imx->base + MX51_ECSPI_DMA);
+ } else {
+ reg = readl(spi_imx->base + MX51_ECSPI_CTRL);
+ reg |= MX51_ECSPI_CTRL_XCH;
+ writel(reg, spi_imx->base + MX51_ECSPI_CTRL);
+ }
}
static void mx51_ecspi_disable(struct spi_imx_data *spi_imx)
@@ -759,7 +765,6 @@ static void mx51_setup_wml(struct spi_imx_data *spi_imx)
writel(MX51_ECSPI_DMA_RX_WML(spi_imx->wml - 1) |
MX51_ECSPI_DMA_TX_WML(tx_wml) |
MX51_ECSPI_DMA_RXT_WML(spi_imx->wml) |
- MX51_ECSPI_DMA_TEDEN | MX51_ECSPI_DMA_RXDEN |
MX51_ECSPI_DMA_RXTDEN, spi_imx->base + MX51_ECSPI_DMA);
}
@@ -1520,6 +1525,8 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx,
reinit_completion(&spi_imx->dma_tx_completion);
dma_async_issue_pending(controller->dma_tx);
+ spi_imx->devtype_data->trigger(spi_imx);
+
transfer_timeout = spi_imx_calculate_timeout(spi_imx, transfer->len);
/* Wait SDMA to finish the data transfer.*/
diff --git a/drivers/spi/spi-nxp-fspi.c b/drivers/spi/spi-nxp-fspi.c
index b6c79e50d842..50a7e4916a60 100644
--- a/drivers/spi/spi-nxp-fspi.c
+++ b/drivers/spi/spi-nxp-fspi.c
@@ -1287,7 +1287,7 @@ static int nxp_fspi_probe(struct platform_device *pdev)
{
struct spi_controller *ctlr;
struct device *dev = &pdev->dev;
- struct device_node *np = dev->of_node;
+ struct fwnode_handle *fwnode = dev_fwnode(dev);
struct resource *res;
struct nxp_fspi *f;
int ret, irq;
@@ -1309,7 +1309,7 @@ static int nxp_fspi_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, f);
/* find the resources - configuration register address space */
- if (is_acpi_node(dev_fwnode(f->dev)))
+ if (is_acpi_node(fwnode))
f->iobase = devm_platform_ioremap_resource(pdev, 0);
else
f->iobase = devm_platform_ioremap_resource_byname(pdev, "fspi_base");
@@ -1317,7 +1317,7 @@ static int nxp_fspi_probe(struct platform_device *pdev)
return PTR_ERR(f->iobase);
/* find the resources - controller memory mapped space */
- if (is_acpi_node(dev_fwnode(f->dev)))
+ if (is_acpi_node(fwnode))
res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
else
res = platform_get_resource_byname(pdev,
@@ -1330,7 +1330,7 @@ static int nxp_fspi_probe(struct platform_device *pdev)
f->memmap_phy_size = resource_size(res);
/* find the clocks */
- if (dev_of_node(&pdev->dev)) {
+ if (is_of_node(fwnode)) {
f->clk_en = devm_clk_get(dev, "fspi_en");
if (IS_ERR(f->clk_en))
return PTR_ERR(f->clk_en);
@@ -1383,7 +1383,7 @@ static int nxp_fspi_probe(struct platform_device *pdev)
else
ctlr->mem_caps = &nxp_fspi_mem_caps;
- ctlr->dev.of_node = np;
+ device_set_node(&ctlr->dev, fwnode);
ret = devm_add_action_or_reset(dev, nxp_fspi_cleanup, f);
if (ret)
diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index d59cc8a18484..c86dc56f38b4 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -300,7 +300,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
/* Read out all the data from the Rx FIFO */
rx_words = n_words;
- stalled = 10;
+ stalled = 32;
while (rx_words) {
if (rx_words == n_words && !(stalled--) &&
!(sr & XSPI_SR_TX_EMPTY_MASK) &&
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 2e0647a06890..e25df9990f82 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2851,6 +2851,18 @@ static acpi_status acpi_register_spi_device(struct spi_controller *ctlr,
acpi_set_modalias(adev, acpi_device_hid(adev), spi->modalias,
sizeof(spi->modalias));
+ /*
+ * This gets re-tried in spi_probe() for -EPROBE_DEFER handling in case
+ * the GPIO controller does not have a driver yet. This needs to be done
+ * here too, because this call sets the GPIO direction and/or bias.
+ * Setting these needs to be done even if there is no driver, in which
+ * case spi_probe() will never get called.
+ * TODO: ideally the setup of the GPIO should be handled in a generic
+ * manner in the ACPI/gpiolib core code.
+ */
+ if (spi->irq < 0)
+ spi->irq = acpi_dev_gpio_irq_get(adev, 0);
+
acpi_device_set_enumerated(adev);
adev->power.flags.ignore_parent = true;
diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index c7b7da629741..01a8e349dc4d 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -894,6 +894,9 @@ static ssize_t tcm_loop_tpg_address_show(struct config_item *item,
struct tcm_loop_tpg, tl_se_tpg);
struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba;
+ if (!tl_hba->sh)
+ return -ENODEV;
+
return snprintf(page, PAGE_SIZE, "%d:0:%d\n",
tl_hba->sh->host_no, tl_tpg->tl_tpgt);
}
diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c
index b19acd662726..9e51c535ba8c 100644
--- a/drivers/target/target_core_configfs.c
+++ b/drivers/target/target_core_configfs.c
@@ -3670,8 +3670,6 @@ static int __init target_core_init_configfs(void)
{
struct configfs_subsystem *subsys = &target_core_fabrics;
struct t10_alua_lu_gp *lu_gp;
- struct cred *kern_cred;
- const struct cred *old_cred;
int ret;
pr_debug("TARGET_CORE[0]: Loading Generic Kernel Storage"
@@ -3748,16 +3746,8 @@ static int __init target_core_init_configfs(void)
if (ret < 0)
goto out;
- /* We use the kernel credentials to access the target directory */
- kern_cred = prepare_kernel_cred(&init_task);
- if (!kern_cred) {
- ret = -ENOMEM;
- goto out;
- }
- old_cred = override_creds(kern_cred);
- target_init_dbroot();
- revert_creds(old_cred);
- put_cred(kern_cred);
+ scoped_with_kernel_creds()
+ target_init_dbroot();
return 0;
diff --git a/drivers/tee/qcomtee/call.c b/drivers/tee/qcomtee/call.c
index ac134452cc9c..65f9140d4e1f 100644
--- a/drivers/tee/qcomtee/call.c
+++ b/drivers/tee/qcomtee/call.c
@@ -645,7 +645,7 @@ static void qcomtee_get_version(struct tee_device *teedev,
static void qcomtee_get_qtee_feature_list(struct tee_context *ctx, u32 id,
u32 *version)
{
- struct qcomtee_object_invoke_ctx *oic __free(kfree);
+ struct qcomtee_object_invoke_ctx *oic __free(kfree) = NULL;
struct qcomtee_object *client_env, *service;
struct qcomtee_arg u[3] = { 0 };
int result;
diff --git a/drivers/tee/qcomtee/core.c b/drivers/tee/qcomtee/core.c
index b6715ada7700..ecd04403591c 100644
--- a/drivers/tee/qcomtee/core.c
+++ b/drivers/tee/qcomtee/core.c
@@ -82,7 +82,7 @@ static void qcomtee_do_release_qtee_object(struct work_struct *work)
{
struct qcomtee_object *object;
struct qcomtee *qcomtee;
- int ret, result;
+ int ret, result = 0;
/* RELEASE does not require any argument. */
struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } };
diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c
index 5f63f9b9cf40..addb4a20d5ea 100644
--- a/drivers/thunderbolt/nhi.c
+++ b/drivers/thunderbolt/nhi.c
@@ -1538,6 +1538,8 @@ static struct pci_device_id nhi_ids[] = {
.driver_data = (kernel_ulong_t)&icl_nhi_ops },
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_PTL_P_NHI1),
.driver_data = (kernel_ulong_t)&icl_nhi_ops },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_WCL_NHI0),
+ .driver_data = (kernel_ulong_t)&icl_nhi_ops },
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI) },
{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI) },
diff --git a/drivers/thunderbolt/nhi.h b/drivers/thunderbolt/nhi.h
index 16744f25a9a0..24ac4246d0ca 100644
--- a/drivers/thunderbolt/nhi.h
+++ b/drivers/thunderbolt/nhi.h
@@ -75,6 +75,7 @@ extern const struct tb_nhi_ops icl_nhi_ops;
#define PCI_DEVICE_ID_INTEL_TITAN_RIDGE_DD_BRIDGE 0x15ef
#define PCI_DEVICE_ID_INTEL_ADL_NHI0 0x463e
#define PCI_DEVICE_ID_INTEL_ADL_NHI1 0x466d
+#define PCI_DEVICE_ID_INTEL_WCL_NHI0 0x4d33
#define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_80G_NHI 0x5781
#define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HOST_40G_NHI 0x5784
#define PCI_DEVICE_ID_INTEL_BARLOW_RIDGE_HUB_80G_BRIDGE 0x5786
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 8bb1a01fef2a..41c1d909525c 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -589,6 +589,23 @@ static inline void legacy_pty_init(void) { }
#ifdef CONFIG_UNIX98_PTYS
static struct cdev ptmx_cdev;
+static struct file *ptm_open_peer_file(struct file *master,
+ struct tty_struct *tty, int flags)
+{
+ struct path path;
+ struct file *file;
+
+ /* Compute the slave's path */
+ path.mnt = devpts_mntget(master, tty->driver_data);
+ if (IS_ERR(path.mnt))
+ return ERR_CAST(path.mnt);
+ path.dentry = tty->link->driver_data;
+
+ file = dentry_open(&path, flags, current_cred());
+ mntput(path.mnt);
+ return file;
+}
+
/**
* ptm_open_peer - open the peer of a pty
* @master: the open struct file of the ptmx device node
@@ -601,42 +618,10 @@ static struct cdev ptmx_cdev;
*/
int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags)
{
- int fd;
- struct file *filp;
- int retval = -EINVAL;
- struct path path;
-
if (tty->driver != ptm_driver)
return -EIO;
- fd = get_unused_fd_flags(flags);
- if (fd < 0) {
- retval = fd;
- goto err;
- }
-
- /* Compute the slave's path */
- path.mnt = devpts_mntget(master, tty->driver_data);
- if (IS_ERR(path.mnt)) {
- retval = PTR_ERR(path.mnt);
- goto err_put;
- }
- path.dentry = tty->link->driver_data;
-
- filp = dentry_open(&path, flags, current_cred());
- mntput(path.mnt);
- if (IS_ERR(filp)) {
- retval = PTR_ERR(filp);
- goto err_put;
- }
-
- fd_install(fd, filp);
- return fd;
-
-err_put:
- put_unused_fd(fd);
-err:
- return retval;
+ return FD_ADD(flags, ptm_open_peer_file(master, tty, flags));
}
static int pty_unix98_ioctl(struct tty_struct *tty,
diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h
index 58e64c4e1e3a..e99f5193d8f1 100644
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -322,13 +322,13 @@ static inline void serial8250_pnp_exit(void) { }
#endif
#ifdef CONFIG_SERIAL_8250_RSA
-void univ8250_rsa_support(struct uart_ops *ops);
+void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops);
void rsa_enable(struct uart_8250_port *up);
void rsa_disable(struct uart_8250_port *up);
void rsa_autoconfig(struct uart_8250_port *up);
void rsa_reset(struct uart_8250_port *up);
#else
-static inline void univ8250_rsa_support(struct uart_ops *ops) { }
+static inline void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops) { }
static inline void rsa_enable(struct uart_8250_port *up) {}
static inline void rsa_disable(struct uart_8250_port *up) {}
static inline void rsa_autoconfig(struct uart_8250_port *up) {}
diff --git a/drivers/tty/serial/8250/8250_platform.c b/drivers/tty/serial/8250/8250_platform.c
index b27981340e76..fe7ec440ffa5 100644
--- a/drivers/tty/serial/8250/8250_platform.c
+++ b/drivers/tty/serial/8250/8250_platform.c
@@ -75,7 +75,7 @@ static void __init __serial8250_isa_init_ports(void)
/* chain base port ops to support Remote Supervisor Adapter */
univ8250_port_ops = *univ8250_port_base_ops;
- univ8250_rsa_support(&univ8250_port_ops);
+ univ8250_rsa_support(&univ8250_port_ops, univ8250_port_base_ops);
if (share_irqs)
irqflag = IRQF_SHARED;
diff --git a/drivers/tty/serial/8250/8250_rsa.c b/drivers/tty/serial/8250/8250_rsa.c
index 40a3dbd9e452..1f182f165525 100644
--- a/drivers/tty/serial/8250/8250_rsa.c
+++ b/drivers/tty/serial/8250/8250_rsa.c
@@ -14,6 +14,8 @@
static unsigned long probe_rsa[PORT_RSA_MAX];
static unsigned int probe_rsa_count;
+static const struct uart_ops *core_port_base_ops;
+
static int rsa8250_request_resource(struct uart_8250_port *up)
{
struct uart_port *port = &up->port;
@@ -67,7 +69,7 @@ static void univ8250_config_port(struct uart_port *port, int flags)
}
}
- univ8250_port_base_ops->config_port(port, flags);
+ core_port_base_ops->config_port(port, flags);
if (port->type != PORT_RSA && up->probe & UART_PROBE_RSA)
rsa8250_release_resource(up);
@@ -78,11 +80,11 @@ static int univ8250_request_port(struct uart_port *port)
struct uart_8250_port *up = up_to_u8250p(port);
int ret;
- ret = univ8250_port_base_ops->request_port(port);
+ ret = core_port_base_ops->request_port(port);
if (ret == 0 && port->type == PORT_RSA) {
ret = rsa8250_request_resource(up);
if (ret < 0)
- univ8250_port_base_ops->release_port(port);
+ core_port_base_ops->release_port(port);
}
return ret;
@@ -94,15 +96,25 @@ static void univ8250_release_port(struct uart_port *port)
if (port->type == PORT_RSA)
rsa8250_release_resource(up);
- univ8250_port_base_ops->release_port(port);
+ core_port_base_ops->release_port(port);
}
-void univ8250_rsa_support(struct uart_ops *ops)
+/*
+ * It is not allowed to directly reference any symbols from 8250.ko here as
+ * that would result in a dependency loop between the 8250.ko and
+ * 8250_base.ko modules. This function is called from 8250.ko and is used to
+ * break the symbolic dependency cycle. Anything that is needed from 8250.ko
+ * has to be passed as pointers to this function which then can adjust those
+ * variables on 8250.ko side or store them locally as needed.
+ */
+void univ8250_rsa_support(struct uart_ops *ops, const struct uart_ops *core_ops)
{
+ core_port_base_ops = core_ops;
ops->config_port = univ8250_config_port;
ops->request_port = univ8250_request_port;
ops->release_port = univ8250_release_port;
}
+EXPORT_SYMBOL_FOR_MODULES(univ8250_rsa_support, "8250");
module_param_hw_array(probe_rsa, ulong, ioport, &probe_rsa_count, 0444);
MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA");
@@ -146,7 +158,6 @@ void rsa_enable(struct uart_8250_port *up)
if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16)
serial_out(up, UART_RSA_FRR, 0);
}
-EXPORT_SYMBOL_FOR_MODULES(rsa_enable, "8250_base");
/*
* Attempts to turn off the RSA FIFO and resets the RSA board back to 115kbps compat mode. It is
@@ -178,7 +189,6 @@ void rsa_disable(struct uart_8250_port *up)
if (result)
up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16;
}
-EXPORT_SYMBOL_FOR_MODULES(rsa_disable, "8250_base");
void rsa_autoconfig(struct uart_8250_port *up)
{
@@ -191,7 +201,6 @@ void rsa_autoconfig(struct uart_8250_port *up)
if (__rsa_enable(up))
up->port.type = PORT_RSA;
}
-EXPORT_SYMBOL_FOR_MODULES(rsa_autoconfig, "8250_base");
void rsa_reset(struct uart_8250_port *up)
{
@@ -200,7 +209,6 @@ void rsa_reset(struct uart_8250_port *up)
serial_out(up, UART_RSA_FRR, 0);
}
-EXPORT_SYMBOL_FOR_MODULES(rsa_reset, "8250_base");
#ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS
#ifndef MODULE
diff --git a/drivers/tty/serial/8250/Makefile b/drivers/tty/serial/8250/Makefile
index 513a0941c284..9ec4d5fe64de 100644
--- a/drivers/tty/serial/8250/Makefile
+++ b/drivers/tty/serial/8250/Makefile
@@ -7,7 +7,6 @@ obj-$(CONFIG_SERIAL_8250) += 8250.o
8250-y := 8250_core.o
8250-y += 8250_platform.o
8250-$(CONFIG_SERIAL_8250_PNP) += 8250_pnp.o
-8250-$(CONFIG_SERIAL_8250_RSA) += 8250_rsa.o
obj-$(CONFIG_SERIAL_8250) += 8250_base.o
8250_base-y := 8250_port.o
@@ -15,6 +14,7 @@ obj-$(CONFIG_SERIAL_8250) += 8250_base.o
8250_base-$(CONFIG_SERIAL_8250_DWLIB) += 8250_dwlib.o
8250_base-$(CONFIG_SERIAL_8250_FINTEK) += 8250_fintek.o
8250_base-$(CONFIG_SERIAL_8250_PCILIB) += 8250_pcilib.o
+8250_base-$(CONFIG_SERIAL_8250_RSA) += 8250_rsa.o
obj-$(CONFIG_SERIAL_8250_CONSOLE) += 8250_early.o
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 22939841b1de..7f17d288c807 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -628,7 +628,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
dmatx->len = count;
dmatx->dma = dma_map_single(dma_dev->dev, dmatx->buf, count,
DMA_TO_DEVICE);
- if (dmatx->dma == DMA_MAPPING_ERROR) {
+ if (dma_mapping_error(dma_dev->dev, dmatx->dma)) {
uap->dmatx.queued = false;
dev_dbg(uap->port.dev, "unable to map TX DMA\n");
return -EBUSY;
diff --git a/drivers/usb/cdns3/cdns3-pci-wrap.c b/drivers/usb/cdns3/cdns3-pci-wrap.c
index 3b3b3dc75f35..57f57c24c663 100644
--- a/drivers/usb/cdns3/cdns3-pci-wrap.c
+++ b/drivers/usb/cdns3/cdns3-pci-wrap.c
@@ -98,10 +98,8 @@ static int cdns3_pci_probe(struct pci_dev *pdev,
wrap = pci_get_drvdata(func);
} else {
wrap = kzalloc(sizeof(*wrap), GFP_KERNEL);
- if (!wrap) {
- pci_disable_device(pdev);
+ if (!wrap)
return -ENOMEM;
- }
}
res = wrap->dev_res;
@@ -160,7 +158,6 @@ static int cdns3_pci_probe(struct pci_dev *pdev,
/* register platform device */
wrap->plat_dev = platform_device_register_full(&plat_info);
if (IS_ERR(wrap->plat_dev)) {
- pci_disable_device(pdev);
err = PTR_ERR(wrap->plat_dev);
kfree(wrap);
return err;
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index ae140c356295..c2ce2f5e60a1 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -25,6 +25,7 @@
#include <linux/of.h>
#include <linux/of_graph.h>
#include <linux/acpi.h>
+#include <linux/pci.h>
#include <linux/pinctrl/consumer.h>
#include <linux/pinctrl/devinfo.h>
#include <linux/reset.h>
@@ -2241,7 +2242,7 @@ int dwc3_core_probe(const struct dwc3_probe_data *data)
dev_set_drvdata(dev, dwc);
dwc3_cache_hwparams(dwc);
- if (!dwc->sysdev_is_parent &&
+ if (!dev_is_pci(dwc->sysdev) &&
DWC3_GHWPARAMS0_AWIDTH(dwc->hwparams.hwparams0) == 64) {
ret = dma_set_mask_and_coherent(dwc->sysdev, DMA_BIT_MASK(64));
if (ret)
diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c
index 39c72cb52ce7..8f5faf632a8b 100644
--- a/drivers/usb/dwc3/dwc3-pci.c
+++ b/drivers/usb/dwc3/dwc3-pci.c
@@ -21,40 +21,41 @@
#include <linux/acpi.h>
#include <linux/delay.h>
+#define PCI_DEVICE_ID_INTEL_CMLLP 0x02ee
+#define PCI_DEVICE_ID_INTEL_CMLH 0x06ee
+#define PCI_DEVICE_ID_INTEL_BXT 0x0aaa
#define PCI_DEVICE_ID_INTEL_BYT 0x0f37
#define PCI_DEVICE_ID_INTEL_MRFLD 0x119e
-#define PCI_DEVICE_ID_INTEL_BSW 0x22b7
-#define PCI_DEVICE_ID_INTEL_SPTLP 0x9d30
-#define PCI_DEVICE_ID_INTEL_SPTH 0xa130
-#define PCI_DEVICE_ID_INTEL_BXT 0x0aaa
#define PCI_DEVICE_ID_INTEL_BXT_M 0x1aaa
-#define PCI_DEVICE_ID_INTEL_APL 0x5aaa
-#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0
-#define PCI_DEVICE_ID_INTEL_CMLLP 0x02ee
-#define PCI_DEVICE_ID_INTEL_CMLH 0x06ee
+#define PCI_DEVICE_ID_INTEL_BSW 0x22b7
#define PCI_DEVICE_ID_INTEL_GLK 0x31aa
-#define PCI_DEVICE_ID_INTEL_CNPLP 0x9dee
-#define PCI_DEVICE_ID_INTEL_CNPH 0xa36e
-#define PCI_DEVICE_ID_INTEL_CNPV 0xa3b0
#define PCI_DEVICE_ID_INTEL_ICLLP 0x34ee
-#define PCI_DEVICE_ID_INTEL_EHL 0x4b7e
-#define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee
#define PCI_DEVICE_ID_INTEL_TGPH 0x43ee
-#define PCI_DEVICE_ID_INTEL_JSP 0x4dee
-#define PCI_DEVICE_ID_INTEL_WCL 0x4d7e
#define PCI_DEVICE_ID_INTEL_ADL 0x460e
-#define PCI_DEVICE_ID_INTEL_ADL_PCH 0x51ee
#define PCI_DEVICE_ID_INTEL_ADLN 0x465e
+#define PCI_DEVICE_ID_INTEL_EHL 0x4b7e
+#define PCI_DEVICE_ID_INTEL_WCL 0x4d7e
+#define PCI_DEVICE_ID_INTEL_JSP 0x4dee
+#define PCI_DEVICE_ID_INTEL_ADL_PCH 0x51ee
#define PCI_DEVICE_ID_INTEL_ADLN_PCH 0x54ee
-#define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1
-#define PCI_DEVICE_ID_INTEL_RPL 0xa70e
+#define PCI_DEVICE_ID_INTEL_APL 0x5aaa
+#define PCI_DEVICE_ID_INTEL_NVLS_PCH 0x6e6f
+#define PCI_DEVICE_ID_INTEL_ARLH_PCH 0x777e
#define PCI_DEVICE_ID_INTEL_RPLS 0x7a61
+#define PCI_DEVICE_ID_INTEL_MTL 0x7e7e
+#define PCI_DEVICE_ID_INTEL_ADLS 0x7ae1
#define PCI_DEVICE_ID_INTEL_MTLM 0x7eb1
#define PCI_DEVICE_ID_INTEL_MTLP 0x7ec1
#define PCI_DEVICE_ID_INTEL_MTLS 0x7f6f
-#define PCI_DEVICE_ID_INTEL_MTL 0x7e7e
-#define PCI_DEVICE_ID_INTEL_ARLH_PCH 0x777e
#define PCI_DEVICE_ID_INTEL_TGL 0x9a15
+#define PCI_DEVICE_ID_INTEL_SPTLP 0x9d30
+#define PCI_DEVICE_ID_INTEL_CNPLP 0x9dee
+#define PCI_DEVICE_ID_INTEL_TGPLP 0xa0ee
+#define PCI_DEVICE_ID_INTEL_SPTH 0xa130
+#define PCI_DEVICE_ID_INTEL_KBP 0xa2b0
+#define PCI_DEVICE_ID_INTEL_CNPH 0xa36e
+#define PCI_DEVICE_ID_INTEL_CNPV 0xa3b0
+#define PCI_DEVICE_ID_INTEL_RPL 0xa70e
#define PCI_DEVICE_ID_INTEL_PTLH 0xe332
#define PCI_DEVICE_ID_INTEL_PTLH_PCH 0xe37e
#define PCI_DEVICE_ID_INTEL_PTLU 0xe432
@@ -412,40 +413,41 @@ static void dwc3_pci_remove(struct pci_dev *pci)
}
static const struct pci_device_id dwc3_pci_id_table[] = {
- { PCI_DEVICE_DATA(INTEL, BSW, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, BYT, &dwc3_pci_intel_byt_swnode) },
- { PCI_DEVICE_DATA(INTEL, MRFLD, &dwc3_pci_intel_mrfld_swnode) },
{ PCI_DEVICE_DATA(INTEL, CMLLP, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, CMLH, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, SPTLP, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, SPTH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, BXT, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, BYT, &dwc3_pci_intel_byt_swnode) },
+ { PCI_DEVICE_DATA(INTEL, MRFLD, &dwc3_pci_intel_mrfld_swnode) },
{ PCI_DEVICE_DATA(INTEL, BXT_M, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, APL, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, KBP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, BSW, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, GLK, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, CNPLP, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, CNPH, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, CNPV, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ICLLP, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, EHL, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, TGPLP, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, TGPH, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, JSP, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, WCL, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADL, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, ADL_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADLN, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, EHL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, WCL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, JSP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, ADL_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, ADLN_PCH, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, ADLS, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, RPL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, APL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, NVLS_PCH, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, ARLH_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, RPLS, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, MTL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, ADLS, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, MTLM, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, MTLP, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, MTL, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, MTLS, &dwc3_pci_intel_swnode) },
- { PCI_DEVICE_DATA(INTEL, ARLH_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, TGL, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, SPTLP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, CNPLP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, TGPLP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, SPTH, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, KBP, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, CNPH, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, CNPV, &dwc3_pci_intel_swnode) },
+ { PCI_DEVICE_DATA(INTEL, RPL, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, PTLH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, PTLH_PCH, &dwc3_pci_intel_swnode) },
{ PCI_DEVICE_DATA(INTEL, PTLU, &dwc3_pci_intel_swnode) },
diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c
index b4229aa13f37..e0bad5708664 100644
--- a/drivers/usb/dwc3/ep0.c
+++ b/drivers/usb/dwc3/ep0.c
@@ -94,6 +94,7 @@ static int __dwc3_gadget_ep0_queue(struct dwc3_ep *dep,
req->request.actual = 0;
req->request.status = -EINPROGRESS;
req->epnum = dep->number;
+ req->status = DWC3_REQUEST_STATUS_QUEUED;
list_add_tail(&req->list, &dep->pending_list);
diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index 6f18b4840a25..5e4997f974dd 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c
@@ -228,6 +228,13 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req,
{
struct dwc3 *dwc = dep->dwc;
+ /*
+ * The request might have been processed and completed while the
+ * spinlock was released. Skip processing if already completed.
+ */
+ if (req->status == DWC3_REQUEST_STATUS_COMPLETED)
+ return;
+
dwc3_gadget_del_and_unmap_request(dep, req, status);
req->status = DWC3_REQUEST_STATUS_COMPLETED;
diff --git a/drivers/usb/gadget/function/f_eem.c b/drivers/usb/gadget/function/f_eem.c
index 6de81ea17274..edbbadad6138 100644
--- a/drivers/usb/gadget/function/f_eem.c
+++ b/drivers/usb/gadget/function/f_eem.c
@@ -477,8 +477,13 @@ static int eem_unwrap(struct gether *port,
req->complete = eem_cmd_complete;
req->zero = 1;
req->context = ctx;
- if (usb_ep_queue(port->in_ep, req, GFP_ATOMIC))
+ if (usb_ep_queue(port->in_ep, req, GFP_ATOMIC)) {
DBG(cdev, "echo response queue fail\n");
+ kfree(ctx);
+ kfree(req->buf);
+ usb_ep_free_request(ep, req);
+ dev_kfree_skb_any(skb2);
+ }
break;
case 1: /* echo response */
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 694653761c44..8dbe79bdc0f9 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -1126,8 +1126,13 @@ static void usb_gadget_state_work(struct work_struct *work)
void usb_gadget_set_state(struct usb_gadget *gadget,
enum usb_device_state state)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&gadget->state_lock, flags);
gadget->state = state;
- schedule_work(&gadget->work);
+ if (!gadget->teardown)
+ schedule_work(&gadget->work);
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
trace_usb_gadget_set_state(gadget, 0);
}
EXPORT_SYMBOL_GPL(usb_gadget_set_state);
@@ -1361,6 +1366,8 @@ static void usb_udc_nop_release(struct device *dev)
void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget,
void (*release)(struct device *dev))
{
+ spin_lock_init(&gadget->state_lock);
+ gadget->teardown = false;
INIT_WORK(&gadget->work, usb_gadget_state_work);
gadget->dev.parent = parent;
@@ -1535,6 +1542,7 @@ EXPORT_SYMBOL_GPL(usb_add_gadget_udc);
void usb_del_gadget(struct usb_gadget *gadget)
{
struct usb_udc *udc = gadget->udc;
+ unsigned long flags;
if (!udc)
return;
@@ -1548,6 +1556,13 @@ void usb_del_gadget(struct usb_gadget *gadget)
kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE);
sysfs_remove_link(&udc->dev.kobj, "gadget");
device_del(&gadget->dev);
+ /*
+ * Set the teardown flag before flushing the work to prevent new work
+ * from being scheduled while we are cleaning up.
+ */
+ spin_lock_irqsave(&gadget->state_lock, flags);
+ gadget->teardown = true;
+ spin_unlock_irqrestore(&gadget->state_lock, flags);
flush_work(&gadget->work);
ida_free(&gadget_id_numbers, gadget->id_number);
cancel_work_sync(&udc->vbus_work);
diff --git a/drivers/usb/gadget/udc/renesas_usbf.c b/drivers/usb/gadget/udc/renesas_usbf.c
index 14f4b2cf05a4..4c201574a0af 100644
--- a/drivers/usb/gadget/udc/renesas_usbf.c
+++ b/drivers/usb/gadget/udc/renesas_usbf.c
@@ -3262,7 +3262,9 @@ static int usbf_probe(struct platform_device *pdev)
if (IS_ERR(udc->regs))
return PTR_ERR(udc->regs);
- devm_pm_runtime_enable(&pdev->dev);
+ ret = devm_pm_runtime_enable(&pdev->dev);
+ if (ret)
+ return ret;
ret = pm_runtime_resume_and_get(&pdev->dev);
if (ret < 0)
return ret;
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 47ac72c2286d..5426c971d2d3 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -114,6 +114,7 @@ struct dbc_port {
unsigned int tx_boundary;
bool registered;
+ bool tx_running;
};
struct dbc_driver {
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index d894081d8d15..57cdda4e09c8 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -47,7 +47,7 @@ dbc_kfifo_to_req(struct dbc_port *port, char *packet)
return len;
}
-static int dbc_start_tx(struct dbc_port *port)
+static int dbc_do_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
{
@@ -57,6 +57,8 @@ static int dbc_start_tx(struct dbc_port *port)
bool do_tty_wake = false;
struct list_head *pool = &port->write_pool;
+ port->tx_running = true;
+
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
len = dbc_kfifo_to_req(port, req->buf);
@@ -77,12 +79,25 @@ static int dbc_start_tx(struct dbc_port *port)
}
}
+ port->tx_running = false;
+
if (do_tty_wake && port->port.tty)
tty_wakeup(port->port.tty);
return status;
}
+/* must be called with port->port_lock held */
+static int dbc_start_tx(struct dbc_port *port)
+{
+ lockdep_assert_held(&port->port_lock);
+
+ if (port->tx_running)
+ return -EBUSY;
+
+ return dbc_do_start_tx(port);
+}
+
static void dbc_start_rx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -535,6 +550,12 @@ static void xhci_dbc_tty_unregister_device(struct xhci_dbc *dbc)
if (!port->registered)
return;
+ /*
+ * Hang up the TTY. This wakes up any blocked
+ * writers and causes subsequent writes to fail.
+ */
+ tty_vhangup(port->port.tty);
+
tty_unregister_device(dbc_tty_driver, port->minor);
xhci_dbc_tty_exit_port(port);
port->registered = false;
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 8e209aa33ea7..5bdcf9ab2b99 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1985,6 +1985,7 @@ static void xhci_cavium_reset_phy_quirk(struct xhci_hcd *xhci)
static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event)
{
+ struct xhci_virt_device *vdev = NULL;
struct usb_hcd *hcd;
u32 port_id;
u32 portsc, cmd_reg;
@@ -2016,6 +2017,9 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event)
goto cleanup;
}
+ if (port->slot_id)
+ vdev = xhci->devs[port->slot_id];
+
/* We might get interrupts after shared_hcd is removed */
if (port->rhub == &xhci->usb3_rhub && xhci->shared_hcd == NULL) {
xhci_dbg(xhci, "ignore port event for removed USB3 hcd\n");
@@ -2038,10 +2042,11 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event)
usb_hcd_resume_root_hub(hcd);
}
- if (hcd->speed >= HCD_USB3 &&
- (portsc & PORT_PLS_MASK) == XDEV_INACTIVE) {
- if (port->slot_id && xhci->devs[port->slot_id])
- xhci->devs[port->slot_id]->flags |= VDEV_PORT_ERROR;
+ if (vdev && (portsc & PORT_PLS_MASK) == XDEV_INACTIVE) {
+ if (!(portsc & PORT_RESET))
+ vdev->flags |= VDEV_PORT_ERROR;
+ } else if (vdev && portsc & PORT_RC) {
+ vdev->flags &= ~VDEV_PORT_ERROR;
}
if ((portsc & PORT_PLC) && (portsc & PORT_PLS_MASK) == XDEV_RESUME) {
@@ -2099,7 +2104,7 @@ static void handle_port_status(struct xhci_hcd *xhci, union xhci_trb *event)
* so the roothub behavior is consistent with external
* USB 3.0 hub behavior.
*/
- if (port->slot_id && xhci->devs[port->slot_id])
+ if (vdev)
xhci_ring_device(xhci, port->slot_id);
if (bus_state->port_remote_wakeup & (1 << hcd_portnum)) {
xhci_test_and_clear_bit(xhci, port, PORT_PLC);
diff --git a/drivers/usb/host/xhci-sideband.c b/drivers/usb/host/xhci-sideband.c
index e771a476fef2..a85f62a73313 100644
--- a/drivers/usb/host/xhci-sideband.c
+++ b/drivers/usb/host/xhci-sideband.c
@@ -73,9 +73,12 @@ err:
return NULL;
}
+/* Caller must hold sb->mutex */
static void
__xhci_sideband_remove_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *ep)
{
+ lockdep_assert_held(&sb->mutex);
+
/*
* Issue a stop endpoint command when an endpoint is removed.
* The stop ep cmd handler will handle the ring cleanup.
@@ -86,6 +89,25 @@ __xhci_sideband_remove_endpoint(struct xhci_sideband *sb, struct xhci_virt_ep *e
sb->eps[ep->ep_index] = NULL;
}
+/* Caller must hold sb->mutex */
+static void
+__xhci_sideband_remove_interrupter(struct xhci_sideband *sb)
+{
+ struct usb_device *udev;
+
+ lockdep_assert_held(&sb->mutex);
+
+ if (!sb->ir)
+ return;
+
+ xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir);
+ sb->ir = NULL;
+ udev = sb->vdev->udev;
+
+ if (udev->state != USB_STATE_NOTATTACHED)
+ usb_offload_put(udev);
+}
+
/* sideband api functions */
/**
@@ -131,14 +153,16 @@ xhci_sideband_add_endpoint(struct xhci_sideband *sb,
struct xhci_virt_ep *ep;
unsigned int ep_index;
- mutex_lock(&sb->mutex);
+ guard(mutex)(&sb->mutex);
+
+ if (!sb->vdev)
+ return -ENODEV;
+
ep_index = xhci_get_endpoint_index(&host_ep->desc);
ep = &sb->vdev->eps[ep_index];
- if (ep->ep_state & EP_HAS_STREAMS) {
- mutex_unlock(&sb->mutex);
+ if (ep->ep_state & EP_HAS_STREAMS)
return -EINVAL;
- }
/*
* Note, we don't know the DMA mask of the audio DSP device, if its
@@ -148,14 +172,11 @@ xhci_sideband_add_endpoint(struct xhci_sideband *sb,
* and let this function add the endpoint and allocate the ring buffer
* with the smallest common DMA mask
*/
- if (sb->eps[ep_index] || ep->sideband) {
- mutex_unlock(&sb->mutex);
+ if (sb->eps[ep_index] || ep->sideband)
return -EBUSY;
- }
ep->sideband = sb;
sb->eps[ep_index] = ep;
- mutex_unlock(&sb->mutex);
return 0;
}
@@ -180,18 +201,16 @@ xhci_sideband_remove_endpoint(struct xhci_sideband *sb,
struct xhci_virt_ep *ep;
unsigned int ep_index;
- mutex_lock(&sb->mutex);
+ guard(mutex)(&sb->mutex);
+
ep_index = xhci_get_endpoint_index(&host_ep->desc);
ep = sb->eps[ep_index];
- if (!ep || !ep->sideband || ep->sideband != sb) {
- mutex_unlock(&sb->mutex);
+ if (!ep || !ep->sideband || ep->sideband != sb)
return -ENODEV;
- }
__xhci_sideband_remove_endpoint(sb, ep);
xhci_initialize_ring_info(ep->ring);
- mutex_unlock(&sb->mutex);
return 0;
}
@@ -316,28 +335,25 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg,
if (!sb || !sb->xhci)
return -ENODEV;
- mutex_lock(&sb->mutex);
- if (sb->ir) {
- ret = -EBUSY;
- goto out;
- }
+ guard(mutex)(&sb->mutex);
+
+ if (!sb->vdev)
+ return -ENODEV;
+
+ if (sb->ir)
+ return -EBUSY;
sb->ir = xhci_create_secondary_interrupter(xhci_to_hcd(sb->xhci),
num_seg, imod_interval,
intr_num);
- if (!sb->ir) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!sb->ir)
+ return -ENOMEM;
udev = sb->vdev->udev;
ret = usb_offload_get(udev);
sb->ir->ip_autoclear = ip_autoclear;
-out:
- mutex_unlock(&sb->mutex);
-
return ret;
}
EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter);
@@ -352,21 +368,12 @@ EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter);
void
xhci_sideband_remove_interrupter(struct xhci_sideband *sb)
{
- struct usb_device *udev;
-
- if (!sb || !sb->ir)
+ if (!sb)
return;
- mutex_lock(&sb->mutex);
- xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir);
-
- sb->ir = NULL;
- udev = sb->vdev->udev;
+ guard(mutex)(&sb->mutex);
- if (udev->state != USB_STATE_NOTATTACHED)
- usb_offload_put(udev);
-
- mutex_unlock(&sb->mutex);
+ __xhci_sideband_remove_interrupter(sb);
}
EXPORT_SYMBOL_GPL(xhci_sideband_remove_interrupter);
@@ -465,6 +472,7 @@ EXPORT_SYMBOL_GPL(xhci_sideband_register);
void
xhci_sideband_unregister(struct xhci_sideband *sb)
{
+ struct xhci_virt_device *vdev;
struct xhci_hcd *xhci;
int i;
@@ -473,17 +481,23 @@ xhci_sideband_unregister(struct xhci_sideband *sb)
xhci = sb->xhci;
- mutex_lock(&sb->mutex);
- for (i = 0; i < EP_CTX_PER_DEV; i++)
- if (sb->eps[i])
- __xhci_sideband_remove_endpoint(sb, sb->eps[i]);
- mutex_unlock(&sb->mutex);
+ scoped_guard(mutex, &sb->mutex) {
+ vdev = sb->vdev;
+ if (!vdev)
+ return;
+
+ for (i = 0; i < EP_CTX_PER_DEV; i++)
+ if (sb->eps[i])
+ __xhci_sideband_remove_endpoint(sb, sb->eps[i]);
- xhci_sideband_remove_interrupter(sb);
+ __xhci_sideband_remove_interrupter(sb);
+
+ sb->vdev = NULL;
+ }
spin_lock_irq(&xhci->lock);
sb->xhci = NULL;
- sb->vdev->sideband = NULL;
+ vdev->sideband = NULL;
spin_unlock_irq(&xhci->lock);
kfree(sb);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 0cb45b95e4f5..a148a1280126 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -4007,6 +4007,7 @@ static int xhci_discover_or_reset_device(struct usb_hcd *hcd,
xhci_get_slot_state(xhci, virt_dev->out_ctx));
xhci_dbg(xhci, "Not freeing device rings.\n");
/* Don't treat this as an error. May change my mind later. */
+ virt_dev->flags = 0;
ret = 0;
goto command_cleanup;
case COMP_SUCCESS:
diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index 8f536f2c500f..dc2fec9168b7 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -813,18 +813,18 @@ static void usbhs_remove(struct platform_device *pdev)
flush_delayed_work(&priv->notify_hotplug_work);
- /* power off */
- if (!usbhs_get_dparam(priv, runtime_pwctrl))
- usbhsc_power_ctrl(priv, 0);
-
- pm_runtime_disable(&pdev->dev);
-
usbhs_platform_call(priv, hardware_exit, pdev);
- usbhsc_clk_put(priv);
reset_control_assert(priv->rsts);
usbhs_mod_remove(priv);
usbhs_fifo_remove(priv);
usbhs_pipe_remove(priv);
+
+ /* power off */
+ if (!usbhs_get_dparam(priv, runtime_pwctrl))
+ usbhsc_power_ctrl(priv, 0);
+
+ usbhsc_clk_put(priv);
+ pm_runtime_disable(&pdev->dev);
}
static int usbhsc_suspend(struct device *dev)
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 49666c33b41f..b37fa31f5694 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1074,6 +1074,7 @@ static const struct usb_device_id id_table_combined[] = {
/* U-Blox devices */
{ USB_DEVICE(UBLOX_VID, UBLOX_C099F9P_ZED_PID) },
{ USB_DEVICE(UBLOX_VID, UBLOX_C099F9P_ODIN_PID) },
+ { USB_DEVICE_INTERFACE_NUMBER(UBLOX_VID, UBLOX_EVK_M101_PID, 2) },
/* FreeCalypso USB adapters */
{ USB_DEVICE(FTDI_VID, FTDI_FALCONIA_JTAG_BUF_PID),
.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 4cc1fae8acb9..2539b9e2f712 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -1614,6 +1614,7 @@
#define UBLOX_VID 0x1546
#define UBLOX_C099F9P_ZED_PID 0x0502
#define UBLOX_C099F9P_ODIN_PID 0x0503
+#define UBLOX_EVK_M101_PID 0x0506
/*
* GMC devices
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 5de856f65f0d..e9400727ad36 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -2424,12 +2424,18 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */
{ USB_DEVICE(0x33f8, 0x0104), /* Rolling RW101-GL (laptop RMNET) */
.driver_info = RSVD(4) | RSVD(5) },
+ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */
+ .driver_info = RSVD(5) },
{ USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a2, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */
{ USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a3, 0xff) }, /* Rolling RW101-GL (laptop MBIM) */
{ USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a4, 0xff), /* Rolling RW101-GL (laptop MBIM) */
.driver_info = RSVD(4) },
- { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0115, 0xff), /* Rolling RW135-GL (laptop MBIM) */
- .driver_info = RSVD(5) },
+ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a8, 0xff), /* Rolling RW101R-GL (laptop MBIM) */
+ .driver_info = RSVD(4) },
+ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x01a9, 0xff), /* Rolling RW101R-GL (laptop MBIM) */
+ .driver_info = RSVD(4) },
+ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0301, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */
+ { USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0302, 0xff) }, /* Rolling RW101R-GL (laptop MBIM) */
{ USB_DEVICE_INTERFACE_CLASS(0x33f8, 0x0802, 0xff), /* Rolling RW350-GL (laptop MBIM) */
.driver_info = RSVD(5) },
{ USB_DEVICE_AND_INTERFACE_INFO(0x3731, 0x0100, 0xff, 0xff, 0x30) }, /* NetPrisma LCUK54-WWD for Global */
diff --git a/drivers/usb/storage/sddr55.c b/drivers/usb/storage/sddr55.c
index b323f0a36260..9d813727e65f 100644
--- a/drivers/usb/storage/sddr55.c
+++ b/drivers/usb/storage/sddr55.c
@@ -469,6 +469,12 @@ static int sddr55_write_data(struct us_data *us,
new_pba = (status[3] + (status[4] << 8) + (status[5] << 16))
>> info->blockshift;
+ /* check if device-reported new_pba is out of range */
+ if (new_pba >= (info->capacity >> (info->blockshift + info->pageshift))) {
+ result = USB_STOR_TRANSPORT_FAILED;
+ goto leave;
+ }
+
/* check status for error */
if (status[0] == 0xff && status[1] == 0x4) {
info->pba_to_lba[new_pba] = BAD_BLOCK;
diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c
index 1aa1bd26c81f..9a4bf86e7b6a 100644
--- a/drivers/usb/storage/transport.c
+++ b/drivers/usb/storage/transport.c
@@ -1200,7 +1200,23 @@ int usb_stor_Bulk_transport(struct scsi_cmnd *srb, struct us_data *us)
US_BULK_CS_WRAP_LEN &&
bcs->Signature ==
cpu_to_le32(US_BULK_CS_SIGN)) {
+ unsigned char buf[US_BULK_CS_WRAP_LEN];
+
usb_stor_dbg(us, "Device skipped data phase\n");
+
+ /*
+ * Devices skipping data phase might leave CSW data in srb's
+ * transfer buffer. Zero it to prevent USB protocol leakage.
+ */
+ sg = NULL;
+ offset = 0;
+ memset(buf, 0, sizeof(buf));
+ if (usb_stor_access_xfer_buf(buf,
+ US_BULK_CS_WRAP_LEN, srb, &sg,
+ &offset, TO_XFER_BUF) !=
+ US_BULK_CS_WRAP_LEN)
+ usb_stor_dbg(us, "Failed to clear CSW data\n");
+
scsi_set_resid(srb, transfer_length);
goto skipped_data_phase;
}
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 4ed0dc19afe0..45b01df364f7 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -698,6 +698,10 @@ static int uas_queuecommand_lck(struct scsi_cmnd *cmnd)
* of queueing, no matter how fatal the error
*/
if (err == -ENODEV) {
+ if (cmdinfo->state & (COMMAND_INFLIGHT | DATA_IN_URB_INFLIGHT |
+ DATA_OUT_URB_INFLIGHT))
+ goto out;
+
set_host_byte(cmnd, DID_NO_CONNECT);
scsi_done(cmnd);
goto zombie;
@@ -711,6 +715,7 @@ static int uas_queuecommand_lck(struct scsi_cmnd *cmnd)
uas_add_work(cmnd);
}
+out:
devinfo->cmnd[idx] = cmnd;
zombie:
spin_unlock_irqrestore(&devinfo->lock, flags);
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index dfa5276a5a43..47f50d7a385c 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -938,7 +938,7 @@ UNUSUAL_DEV( 0x05e3, 0x0723, 0x9451, 0x9451,
UNUSUAL_DEV( 0x0603, 0x8611, 0x0000, 0xffff,
"Novatek",
"NTK96550-based camera",
- USB_SC_SCSI, USB_PR_BULK, NULL,
+ USB_SC_DEVICE, USB_PR_DEVICE, NULL,
US_FL_BULK_IGNORE_TAG ),
/*
diff --git a/drivers/usb/typec/ucsi/psy.c b/drivers/usb/typec/ucsi/psy.c
index 62a9d68bb66d..8ae900c8c132 100644
--- a/drivers/usb/typec/ucsi/psy.c
+++ b/drivers/usb/typec/ucsi/psy.c
@@ -145,6 +145,11 @@ static int ucsi_psy_get_current_max(struct ucsi_connector *con,
{
u32 pdo;
+ if (!UCSI_CONSTAT(con, CONNECTED)) {
+ val->intval = 0;
+ return 0;
+ }
+
switch (UCSI_CONSTAT(con, PWR_OPMODE)) {
case UCSI_CONSTAT_PWR_OPMODE_PD:
if (con->num_pdos > 0) {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 82034efb74fc..a7936bd1aabe 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -573,6 +573,8 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
vcq->mcq.set_ci_db = vcq->db.db;
vcq->mcq.arm_db = vcq->db.db + 1;
vcq->mcq.cqe_sz = 64;
+ vcq->mcq.comp = mlx5_vdpa_cq_comp;
+ vcq->cqe = num_ent;
err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
if (err)
@@ -612,10 +614,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
if (err)
goto err_vec;
- vcq->mcq.comp = mlx5_vdpa_cq_comp;
- vcq->cqe = num_ent;
- vcq->mcq.set_ci_db = vcq->db.db;
- vcq->mcq.arm_db = vcq->db.db + 1;
mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
kfree(in);
return 0;
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index c376a6279de0..d47ffada6912 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -299,10 +299,8 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
char __user *arg)
{
struct vfio_device *device;
- struct file *filep;
char *buf;
- int fdno;
- int ret;
+ int fd;
buf = strndup_user(arg, PAGE_SIZE);
if (IS_ERR(buf))
@@ -313,26 +311,10 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
if (IS_ERR(device))
return PTR_ERR(device);
- fdno = get_unused_fd_flags(O_CLOEXEC);
- if (fdno < 0) {
- ret = fdno;
- goto err_put_device;
- }
-
- filep = vfio_device_open_file(device);
- if (IS_ERR(filep)) {
- ret = PTR_ERR(filep);
- goto err_put_fdno;
- }
-
- fd_install(fdno, filep);
- return fdno;
-
-err_put_fdno:
- put_unused_fd(fdno);
-err_put_device:
- vfio_device_put_registration(device);
- return ret;
+ fd = FD_ADD(O_CLOEXEC, vfio_device_open_file(device));
+ if (fd < 0)
+ vfio_device_put_registration(device);
+ return fd;
}
static int vfio_group_ioctl_get_status(struct vfio_group *group,
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 35ded4330431..8f7f50acb6d6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -592,14 +592,15 @@ static void vhost_net_busy_poll(struct vhost_net *net,
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_net_virtqueue *tnvq,
unsigned int *out_num, unsigned int *in_num,
- struct msghdr *msghdr, bool *busyloop_intr)
+ struct msghdr *msghdr, bool *busyloop_intr,
+ unsigned int *ndesc)
{
struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ int r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL, ndesc);
if (r == tvq->num && tvq->busyloop_timeout) {
/* Flush batched packets first */
@@ -610,8 +611,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
- r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ r = vhost_get_vq_desc_n(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL, ndesc);
}
return r;
@@ -642,12 +643,14 @@ static int get_tx_bufs(struct vhost_net *net,
struct vhost_net_virtqueue *nvq,
struct msghdr *msg,
unsigned int *out, unsigned int *in,
- size_t *len, bool *busyloop_intr)
+ size_t *len, bool *busyloop_intr,
+ unsigned int *ndesc)
{
struct vhost_virtqueue *vq = &nvq->vq;
int ret;
- ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+ ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg,
+ busyloop_intr, ndesc);
if (ret < 0 || ret == vq->num)
return ret;
@@ -766,6 +769,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
+ unsigned int ndesc = 0;
do {
bool busyloop_intr = false;
@@ -774,7 +778,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
vhost_tx_batch(net, nvq, sock, &msg);
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ &busyloop_intr, &ndesc);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -806,7 +810,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
goto done;
} else if (unlikely(err != -ENOSPC)) {
vhost_tx_batch(net, nvq, sock, &msg);
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_vq_desc(vq, 1, ndesc);
vhost_net_enable_vq(net, vq);
break;
}
@@ -829,7 +833,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_vq_desc(vq, 1, ndesc);
vhost_net_enable_vq(net, vq);
break;
}
@@ -868,6 +872,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
int err;
struct vhost_net_ubuf_ref *ubufs;
struct ubuf_info_msgzc *ubuf;
+ unsigned int ndesc = 0;
bool zcopy_used;
int sent_pkts = 0;
@@ -879,7 +884,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
busyloop_intr = false;
head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ &busyloop_intr, &ndesc);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -941,7 +946,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
}
if (retry) {
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_vq_desc(vq, 1, ndesc);
vhost_net_enable_vq(net, vq);
break;
}
@@ -1045,11 +1050,12 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
unsigned *iovcount,
struct vhost_log *log,
unsigned *log_num,
- unsigned int quota)
+ unsigned int quota,
+ unsigned int *ndesc)
{
struct vhost_virtqueue *vq = &nvq->vq;
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
- unsigned int out, in;
+ unsigned int out, in, desc_num, n = 0;
int seg = 0;
int headcount = 0;
unsigned d;
@@ -1064,9 +1070,9 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
r = -ENOBUFS;
goto err;
}
- r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
+ r = vhost_get_vq_desc_n(vq, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num, &desc_num);
if (unlikely(r < 0))
goto err;
@@ -1093,6 +1099,7 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
++headcount;
datalen -= len;
seg += in;
+ n += desc_num;
}
*iovcount = seg;
@@ -1113,9 +1120,11 @@ static int get_rx_bufs(struct vhost_net_virtqueue *nvq,
nheads[0] = headcount;
}
+ *ndesc = n;
+
return headcount;
err:
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, headcount, n);
return r;
}
@@ -1151,6 +1160,7 @@ static void handle_rx(struct vhost_net *net)
struct iov_iter fixup;
__virtio16 num_buffers;
int recv_pkts = 0;
+ unsigned int ndesc;
mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vhost_vq_get_backend(vq);
@@ -1182,7 +1192,8 @@ static void handle_rx(struct vhost_net *net)
headcount = get_rx_bufs(nvq, vq->heads + count,
vq->nheads + count,
vhost_len, &in, vq_log, &log,
- likely(mergeable) ? UIO_MAXIOV : 1);
+ likely(mergeable) ? UIO_MAXIOV : 1,
+ &ndesc);
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
@@ -1228,7 +1239,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, headcount, ndesc);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1252,7 +1263,7 @@ static void handle_rx(struct vhost_net *net)
copy_to_iter(&num_buffers, sizeof num_buffers,
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_vq_desc(vq, headcount, ndesc);
goto out;
}
nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8570fdf2e14a..a78226b37739 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2792,18 +2792,34 @@ static int get_indirect(struct vhost_virtqueue *vq,
return 0;
}
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
+/**
+ * vhost_get_vq_desc_n - Fetch the next available descriptor chain and build iovecs
+ * @vq: target virtqueue
+ * @iov: array that receives the scatter/gather segments
+ * @iov_size: capacity of @iov in elements
+ * @out_num: the number of output segments
+ * @in_num: the number of input segments
+ * @log: optional array to record addr/len for each writable segment; NULL if unused
+ * @log_num: optional output; number of entries written to @log when provided
+ * @ndesc: optional output; number of descriptors consumed from the available ring
+ * (useful for rollback via vhost_discard_vq_desc)
*
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ * Extracts one available descriptor chain from @vq and translates guest addresses
+ * into host iovecs.
+ *
+ * On success, advances @vq->last_avail_idx by 1 and @vq->next_avail_head by the
+ * number of descriptors consumed (also stored via @ndesc when non-NULL).
+ *
+ * Return:
+ * - head index in [0, @vq->num) on success;
+ * - @vq->num if no descriptor is currently available;
+ * - negative errno on failure
+ */
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int *ndesc)
{
bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER);
struct vring_desc desc;
@@ -2921,17 +2937,49 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
vq->last_avail_idx++;
vq->next_avail_head += c;
+ if (ndesc)
+ *ndesc = c;
+
/* Assume notifications from guest are disabled at this point,
* if they aren't we would need to update avail_event index. */
BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
return head;
}
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_n);
+
+/* This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error.
+ */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ return vhost_get_vq_desc_n(vq, iov, iov_size, out_num, in_num,
+ log, log_num, NULL);
+}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
+/**
+ * vhost_discard_vq_desc - Reverse the effect of vhost_get_vq_desc_n()
+ * @vq: target virtqueue
+ * @nbufs: number of buffers to roll back
+ * @ndesc: number of descriptors to roll back
+ *
+ * Rewinds the internal consumer cursors after a failed attempt to use buffers
+ * returned by vhost_get_vq_desc_n().
+ */
+void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int nbufs,
+ unsigned int ndesc)
{
- vq->last_avail_idx -= n;
+ vq->next_avail_head -= ndesc;
+ vq->last_avail_idx -= nbufs;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 621a6d9a8791..b49f08e4a1b4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -230,7 +230,15 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
+
+int vhost_get_vq_desc_n(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num,
+ unsigned int *ndesc);
+
+void vhost_discard_vq_desc(struct vhost_virtqueue *, int nbuf,
+ unsigned int ndesc);
bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
bool vhost_vq_has_work(struct vhost_virtqueue *vq);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 9bd3c3814b5c..e7e07eb2142e 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -66,6 +66,7 @@
#include <linux/string.h>
#include <linux/kd.h>
#include <linux/panic.h>
+#include <linux/pci.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/fb.h>
@@ -78,6 +79,7 @@
#include <linux/interrupt.h>
#include <linux/crc32.h> /* For counting font checksums */
#include <linux/uaccess.h>
+#include <linux/vga_switcheroo.h>
#include <asm/irq.h>
#include "fbcon.h"
@@ -2899,6 +2901,9 @@ void fbcon_fb_unregistered(struct fb_info *info)
console_lock();
+ if (info->device && dev_is_pci(info->device))
+ vga_switcheroo_client_fb_set(to_pci_dev(info->device), NULL);
+
fbcon_registered_fb[info->node] = NULL;
fbcon_num_registered_fb--;
@@ -3032,6 +3037,10 @@ static int do_fb_registered(struct fb_info *info)
}
}
+ /* Set the fb info for vga_switcheroo clients. Does nothing otherwise. */
+ if (info->device && dev_is_pci(info->device))
+ vga_switcheroo_client_fb_set(to_pci_dev(info->device), info);
+
return ret;
}
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eed551d8555f..633da5e37299 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <linux/slab.h>
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index eb0b083da269..612a230bc012 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -483,24 +483,15 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf)
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
{
- struct inode *inode;
-
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_ALL,
- .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE,
- /* absolute end, byte at end included */
- .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE +
- (vma->vm_end - vma->vm_start - 1),
- };
-
if (!(vma->vm_flags & VM_SHARED))
return;
p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
- inode = file_inode(vma->vm_file);
- filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
+ filemap_fdatawrite_range(file_inode(vma->vm_file)->i_mapping,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE,
+ (loff_t)vma->vm_pgoff * PAGE_SIZE +
+ (vma->vm_end - vma->vm_start - 1));
}
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d0c77ec31b1d..8666c9c62258 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -422,7 +422,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index be297e335468..1661a25f2772 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -112,7 +112,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
/*
* initialize the inode with the stat info
diff --git a/fs/Makefile b/fs/Makefile
index e3523ab2e587..a04274a3c854 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,7 +14,7 @@ obj-y := open.o read_write.o file_table.o super.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
+ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
file_attr.o
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0210df8d3500..0bfc7d151dcd 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -29,7 +29,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
pr_debug("affs_iget(%lu)\n", inode->i_ino);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index f31359922e98..71c10a05cebe 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -140,7 +140,9 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
- cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL);
+ /* Allocate the cell name and the key name in one go. */
+ cell->name = kmalloc(1 + namelen + 1 +
+ 4 + namelen + 1, GFP_KERNEL);
if (!cell->name) {
kfree(cell);
return ERR_PTR(-ENOMEM);
@@ -151,7 +153,11 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
- cell->name[i] = 0;
+ cell->name[i++] = 0;
+
+ cell->key_desc = cell->name + i;
+ memcpy(cell->key_desc, "afs@", 4);
+ memcpy(cell->key_desc + 4, cell->name, cell->name_len + 1);
cell->net = net;
refcount_set(&cell->ref, 1);
@@ -229,7 +235,7 @@ error:
* @name: The name of the cell.
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
- * @excl: T if an error should be given if the cell name already exists.
+ * @reason: Why the cell is being looked up; governs lookup queuing and waiting.
* @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
@@ -239,7 +245,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
@@ -247,12 +254,18 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
enum afs_cell_state state;
int ret, n;
- _enter("%s,%s", name, vllist);
+ _enter("%s,%s,%u", name, vllist, reason);
- if (!excl) {
+ if (reason != AFS_LOOKUP_CELL_PRELOAD) {
cell = afs_find_cell(net, name, namesz, trace);
- if (!IS_ERR(cell))
+ if (!IS_ERR(cell)) {
+ if (reason == AFS_LOOKUP_CELL_DYNROOT)
+ goto no_wait;
+ if (cell->state == AFS_CELL_SETTING_UP ||
+ cell->state == AFS_CELL_UNLOOKED)
+ goto lookup_cell;
goto wait_for_cell;
+ }
}
/* Assume we're probably going to create a cell and preallocate and
@@ -298,26 +311,69 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_queue_new);
+lookup_cell:
+ if (reason != AFS_LOOKUP_CELL_PRELOAD &&
+ reason != AFS_LOOKUP_CELL_ROOTCELL) {
+ set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
+ }
wait_for_cell:
- _debug("wait_for_cell");
state = smp_load_acquire(&cell->state); /* vs error */
- if (state != AFS_CELL_ACTIVE &&
- state != AFS_CELL_DEAD) {
+ switch (state) {
+ case AFS_CELL_ACTIVE:
+ case AFS_CELL_DEAD:
+ break;
+ case AFS_CELL_UNLOOKED:
+ default:
+ if (reason == AFS_LOOKUP_CELL_PRELOAD ||
+ reason == AFS_LOOKUP_CELL_ROOTCELL)
+ break;
+ _debug("wait_for_cell");
afs_see_cell(cell, afs_cell_trace_wait);
wait_var_event(&cell->state,
({
state = smp_load_acquire(&cell->state); /* vs error */
state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
}));
+ _debug("waited_for_cell %d %d", cell->state, cell->error);
}
+no_wait:
/* Check the state obtained from the wait check. */
+ state = smp_load_acquire(&cell->state); /* vs error */
if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
+ if (state == AFS_CELL_ACTIVE) {
+ switch (cell->dns_status) {
+ case DNS_LOOKUP_NOT_DONE:
+ if (cell->dns_source == DNS_RECORD_FROM_CONFIG) {
+ ret = 0;
+ break;
+ }
+ fallthrough;
+ default:
+ ret = -EIO;
+ goto error;
+ case DNS_LOOKUP_GOOD:
+ case DNS_LOOKUP_GOOD_WITH_BAD:
+ ret = 0;
+ break;
+ case DNS_LOOKUP_GOT_NOT_FOUND:
+ ret = -ENOENT;
+ goto error;
+ case DNS_LOOKUP_BAD:
+ ret = -EREMOTEIO;
+ goto error;
+ case DNS_LOOKUP_GOT_LOCAL_FAILURE:
+ case DNS_LOOKUP_GOT_TEMP_FAILURE:
+ case DNS_LOOKUP_GOT_NS_FAILURE:
+ ret = -EDESTADDRREQ;
+ goto error;
+ }
+ }
_leave(" = %p [cell]", cell);
return cell;
@@ -325,7 +381,7 @@ wait_for_cell:
cell_already_exists:
_debug("cell exists");
cell = cursor;
- if (excl) {
+ if (reason == AFS_LOOKUP_CELL_PRELOAD) {
ret = -EEXIST;
} else {
afs_use_cell(cursor, trace);
@@ -384,7 +440,8 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
return -EINVAL;
/* allocate a cell record for the root/workstation cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ new_root = afs_lookup_cell(net, rootcell, len, vllist,
+ AFS_LOOKUP_CELL_ROOTCELL,
afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
@@ -660,33 +717,6 @@ void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
}
/*
- * Allocate a key to use as a placeholder for anonymous user security.
- */
-static int afs_alloc_anon_key(struct afs_cell *cell)
-{
- struct key *key;
- char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp;
-
- /* Create a key to represent an anonymous user. */
- memcpy(keyname, "afs@", 4);
- dp = keyname + 4;
- cp = cell->name;
- do {
- *dp++ = tolower(*cp);
- } while (*cp++);
-
- key = rxrpc_get_null_key(keyname);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- cell->anonymous_key = key;
-
- _debug("anon key %p{%x}",
- cell->anonymous_key, key_serial(cell->anonymous_key));
- return 0;
-}
-
-/*
* Activate a cell.
*/
static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
@@ -695,12 +725,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
struct afs_cell *pcell;
int ret;
- if (!cell->anonymous_key) {
- ret = afs_alloc_anon_key(cell);
- if (ret < 0)
- return ret;
- }
-
ret = afs_proc_cell_setup(cell);
if (ret < 0)
return ret;
@@ -777,6 +801,7 @@ static bool afs_manage_cell(struct afs_cell *cell)
switch (cell->state) {
case AFS_CELL_SETTING_UP:
goto set_up_cell;
+ case AFS_CELL_UNLOOKED:
case AFS_CELL_ACTIVE:
goto cell_is_active;
case AFS_CELL_REMOVING:
@@ -797,7 +822,7 @@ set_up_cell:
goto remove_cell;
}
- afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+ afs_set_cell_state(cell, AFS_CELL_UNLOOKED);
cell_is_active:
if (afs_has_cell_expired(cell, &next_manage))
@@ -807,6 +832,8 @@ cell_is_active:
ret = afs_update_cell(cell);
if (ret < 0)
cell->error = ret;
+ if (cell->state == AFS_CELL_UNLOOKED)
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
}
if (next_manage < TIME64_MAX && cell->net->live) {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 89d36e3e5c79..f4e9e12373ac 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -779,7 +779,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode;
struct inode *inode = NULL, *ti;
afs_dataversion_t data_version = READ_ONCE(dvnode->status.data_version);
- bool supports_ibulk;
+ bool supports_ibulk, isnew;
long ret;
int i;
@@ -850,7 +850,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
* callback counters.
*/
ti = ilookup5_nowait(dir->i_sb, vp->fid.vnode,
- afs_ilookup5_test_by_fid, &vp->fid);
+ afs_ilookup5_test_by_fid, &vp->fid, &isnew);
if (!IS_ERR_OR_NULL(ti)) {
vnode = AFS_FS_I(ti);
vp->dv_before = vnode->status.data_version;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 8c6130789fde..aa56e8951e03 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -64,7 +64,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
@@ -108,7 +108,8 @@ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *
dotted = true;
}
- cell = afs_lookup_cell(net, name, len, NULL, false,
+ cell = afs_lookup_cell(net, name, len, NULL,
+ AFS_LOOKUP_CELL_DYNROOT,
afs_cell_trace_use_lookup_dynroot);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
@@ -258,7 +259,7 @@ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry
vnode = AFS_FS_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 1);
@@ -383,7 +384,7 @@ struct inode *afs_dynroot_iget_root(struct super_block *sb)
vnode = AFS_FS_I(inode);
/* there shouldn't be an existing inode */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 2);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e1cb17b85791..dde1857fcabb 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -427,7 +427,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->netfs.inode.i_state & I_NEW) {
+ if (inode_state_read_once(&vnode->netfs.inode) & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
afs_op_set_error(op, ret);
if (ret == 0)
@@ -579,7 +579,7 @@ struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
inode, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
/* deal with an existing inode */
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
_leave(" = %p", inode);
return inode;
}
@@ -639,7 +639,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
_debug("GOT ROOT INODE %p { vl=%llx }", inode, as->volume->vid);
- BUG_ON(!(inode->i_state & I_NEW));
+ BUG_ON(!(inode_state_read_once(inode) & I_NEW));
vnode = AFS_FS_I(inode);
vnode->cb_v_check = atomic_read(&as->volume->cb_v_break);
@@ -748,7 +748,7 @@ void afs_evict_inode(struct inode *inode)
if ((S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) &&
- (inode->i_state & I_DIRTY) &&
+ (inode_state_read_once(inode) & I_DIRTY) &&
!sbi->dyn_root) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a45ae5c2ef8a..009064b8d661 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -343,6 +343,7 @@ extern const char afs_init_sysname[];
enum afs_cell_state {
AFS_CELL_SETTING_UP,
+ AFS_CELL_UNLOOKED,
AFS_CELL_ACTIVE,
AFS_CELL_REMOVING,
AFS_CELL_DEAD,
@@ -412,6 +413,7 @@ struct afs_cell {
u8 name_len; /* Length of name */
char *name; /* Cell name, case-flattened and NUL-padded */
+ char *key_desc; /* Authentication key description */
};
/*
@@ -1049,9 +1051,18 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
+enum afs_lookup_cell_for {
+ AFS_LOOKUP_CELL_DYNROOT,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
+ AFS_LOOKUP_CELL_DIRECT_MOUNT,
+ AFS_LOOKUP_CELL_PRELOAD,
+ AFS_LOOKUP_CELL_ROOTCELL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
+};
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl,
+ const char *vllist,
+ enum afs_lookup_cell_for reason,
enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 1ad048e6e164..57c204a3c04e 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ cell = afs_lookup_cell(ctx->net, p, size, NULL,
+ AFS_LOOKUP_CELL_MOUNTPOINT,
afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 40e879c8ca77..44520549b509 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,7 +122,8 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ cell = afs_lookup_cell(net, name, strlen(name), args,
+ AFS_LOOKUP_CELL_PRELOAD,
afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 6a7744c9e2a2..55ddce94af03 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -16,6 +16,31 @@
static DEFINE_HASHTABLE(afs_permits_cache, 10);
static DEFINE_SPINLOCK(afs_permits_lock);
+static DEFINE_MUTEX(afs_key_lock);
+
+/*
+ * Allocate a key to use as a placeholder for anonymous user security.
+ */
+static int afs_alloc_anon_key(struct afs_cell *cell)
+{
+ struct key *key;
+
+ mutex_lock(&afs_key_lock);
+ key = cell->anonymous_key;
+ if (!key) {
+ key = rxrpc_get_null_key(cell->key_desc);
+ if (!IS_ERR(key))
+ cell->anonymous_key = key;
+ }
+ mutex_unlock(&afs_key_lock);
+
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+
+ _debug("anon key %p{%x}",
+ cell->anonymous_key, key_serial(cell->anonymous_key));
+ return 0;
+}
/*
* get a key
@@ -23,11 +48,12 @@ static DEFINE_SPINLOCK(afs_permits_lock);
struct key *afs_request_key(struct afs_cell *cell)
{
struct key *key;
+ int ret;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net(&key_type_rxrpc, cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net(&key_type_rxrpc, cell->key_desc,
cell->net->net, NULL);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -35,6 +61,12 @@ struct key *afs_request_key(struct afs_cell *cell)
return key;
}
+ if (!cell->anonymous_key) {
+ ret = afs_alloc_anon_key(cell);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
/* act as anonymous user */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
@@ -52,11 +84,10 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
{
struct key *key;
- _enter("{%x}", key_serial(cell->anonymous_key));
+ _enter("{%s}", cell->key_desc);
- _debug("key %s", cell->anonymous_key->description);
- key = request_key_net_rcu(&key_type_rxrpc,
- cell->anonymous_key->description,
+ _debug("key %s", cell->key_desc);
+ key = request_key_net_rcu(&key_type_rxrpc, cell->key_desc,
cell->net->net);
if (IS_ERR(key)) {
if (PTR_ERR(key) != -ENOKEY) {
@@ -65,6 +96,8 @@ struct key *afs_request_key_rcu(struct afs_cell *cell)
}
/* act as anonymous user */
+ if (!cell->anonymous_key)
+ return NULL; /* Need to allocate */
_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
return key_get(cell->anonymous_key);
} else {
@@ -408,7 +441,7 @@ int afs_permission(struct mnt_idmap *idmap, struct inode *inode,
if (mask & MAY_NOT_BLOCK) {
key = afs_request_key_rcu(vnode->volume->cell);
- if (IS_ERR(key))
+ if (IS_ERR_OR_NULL(key))
return -ECHILD;
ret = -ECHILD;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index da407f2d6f0d..d672b7ab57ae 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -290,7 +290,7 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false,
+ NULL, AFS_LOOKUP_CELL_DIRECT_MOUNT,
afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index 709b4cdb723e..fc9676abd252 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL,
+ AFS_LOOKUP_CELL_ALIAS_CHECK,
afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
diff --git a/fs/aio.c b/fs/aio.c
index 5bc133386407..0a23a8c0717f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1640,10 +1640,10 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
- const struct cred *old_cred = override_creds(iocb->fsync.creds);
- iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
- revert_creds(old_cred);
+ scoped_with_creds(iocb->fsync.creds)
+ iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+
put_cred(iocb->fsync.creds);
iocb_put(iocb);
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 180a458fc4f7..b8381c7fb636 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -280,27 +280,8 @@ static int __anon_inode_getfd(const char *name,
const struct inode *context_inode,
bool make_inode)
{
- int error, fd;
- struct file *file;
-
- error = get_unused_fd_flags(flags);
- if (error < 0)
- return error;
- fd = error;
-
- file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- make_inode);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_put_unused_fd;
- }
- fd_install(fd, file);
-
- return fd;
-
-err_put_unused_fd:
- put_unused_fd(fd);
- return error;
+ return FD_ADD(flags, __anon_inode_getfile(name, fops, priv, flags,
+ context_inode, make_inode));
}
/**
diff --git a/fs/attr.c b/fs/attr.c
index 795f231d00e8..b9ec6b47bab2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -415,7 +415,7 @@ EXPORT_SYMBOL(may_setattr);
* performed on the raw inode simply pass @nop_mnt_idmap.
*/
int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
- struct iattr *attr, struct inode **delegated_inode)
+ struct iattr *attr, struct delegated_inode *delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 23cea74f9933..4fd555528c5d 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -16,6 +16,7 @@
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
+#include <uapi/linux/mount.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/uaccess.h>
@@ -27,6 +28,9 @@
#include <linux/magic.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
+#include "../mount.h"
+#include <linux/ns_common.h>
+
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -114,6 +118,7 @@ struct autofs_sb_info {
int pipefd;
struct file *pipe;
struct pid *oz_pgrp;
+ u64 mnt_ns_id;
int version;
int sub_version;
int min_proto;
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index d8dd150cbd74..a58f9248b0f5 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -231,32 +231,14 @@ static int test_by_type(const struct path *path, void *p)
*/
static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
{
- int err, fd;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (likely(fd >= 0)) {
- struct file *filp;
- struct path path;
-
- err = find_autofs_mount(name, &path, test_by_dev, &devid);
- if (err)
- goto out;
-
- filp = dentry_open(&path, O_RDONLY, current_cred());
- path_put(&path);
- if (IS_ERR(filp)) {
- err = PTR_ERR(filp);
- goto out;
- }
-
- fd_install(fd, filp);
- }
+ struct path path __free(path_put) = {};
+ int err;
- return fd;
+ err = find_autofs_mount(name, &path, test_by_dev, &devid);
+ if (err)
+ return err;
-out:
- put_unused_fd(fd);
- return err;
+ return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred()));
}
/* Open a file descriptor on an autofs mount point */
@@ -381,6 +363,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
+ sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
sbi->flags &= ~AUTOFS_SBI_CATATONIC;
}
out:
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index f5c16ffba013..732aee76a24c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,6 +251,7 @@ static struct autofs_sb_info *autofs_alloc_sbi(void)
sbi->min_proto = AUTOFS_MIN_PROTO_VERSION;
sbi->max_proto = AUTOFS_MAX_PROTO_VERSION;
sbi->pipefd = -1;
+ sbi->mnt_ns_id = to_ns_common(current->nsproxy->mnt_ns)->ns_id;
set_autofs_type_indirect(&sbi->type);
mutex_init(&sbi->wq_mutex);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 174c7205fee4..d10df9d89d1c 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -341,6 +341,14 @@ static struct vfsmount *autofs_d_automount(struct path *path)
if (autofs_oz_mode(sbi))
return NULL;
+ /* Refuse to trigger the mount if the current mount namespace is not
+ * the owner and the mount's propagation type is private.
+ */
+ if (sbi->mnt_ns_id != to_ns_common(current->nsproxy->mnt_ns)->ns_id) {
+ if (vfsmount_to_propagation_flags(path->mnt) & MS_PRIVATE)
+ return ERR_PTR(-EPERM);
+ }
+
/*
* If an expire request is pending everyone must wait.
* If the expire fails we're still mounted so continue
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 15a7f8031084..45da8600d564 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -157,13 +157,37 @@ static int backing_aio_init_wq(struct kiocb *iocb)
return sb_init_dio_done_wq(sb);
}
+static int do_backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags)
+{
+ struct backing_aio *aio = NULL;
+ int ret;
+
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ return vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ }
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ return ret;
+}
ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- struct backing_aio *aio = NULL;
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -176,41 +200,57 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- old_cred = override_creds(ctx->cred);
+ scoped_with_creds(ctx->cred)
+ ret = do_backing_file_read_iter(file, iter, iocb, flags);
+
+ if (ctx->accessed)
+ ctx->accessed(iocb->ki_filp);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+/*
+ * Write to the backing @file on behalf of @iocb.
+ *
+ * A synchronous @iocb is written with vfs_iter_write() at iocb->ki_pos and
+ * @end_write, if set, is called with the result before returning.
+ * Otherwise backing_aio_init_wq() is run first, then a backing_aio carrying
+ * @end_write is allocated, @iocb is cloned into it with ki_flags set to
+ * @flags, and the write is submitted with vfs_iocb_iter_write().
+ *
+ * Returns whatever the underlying write returned (including -EIOCBQUEUED),
+ * the backing_aio_init_wq() error, or -ENOMEM. The result is kept as
+ * ssize_t, the type @end_write takes and backing_file_write_iter() returns,
+ * instead of being narrowed to int.
+ */
+static ssize_t do_backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ void (*end_write)(struct kiocb *, ssize_t))
+{
+ struct backing_aio *aio;
+ ssize_t ret;
+
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
- ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
- } else {
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_complete = backing_aio_rw_complete;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (end_write)
+ end_write(iocb, ret);
+ return ret;
}
-out:
- revert_creds(old_cred);
- if (ctx->accessed)
- ctx->accessed(iocb->ki_filp);
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ return ret;
+
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ return -ENOMEM;
+ aio->orig_iocb = iocb;
+ aio->end_write = end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ /* Start with two references; ours is dropped right after submission. */
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ /* Not queued: call backing_aio_cleanup() with the result ourselves. */
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(backing_file_read_iter);
ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
struct kiocb *iocb, int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
@@ -227,46 +267,8 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- /*
- * Stacked filesystems don't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- flags &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = override_creds(ctx->cred);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(flags);
-
- ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
- if (ctx->end_write)
- ctx->end_write(iocb, ret);
- } else {
- struct backing_aio *aio;
-
- ret = backing_aio_init_wq(iocb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
- if (!aio)
- goto out;
-
- aio->orig_iocb = iocb;
- aio->end_write = ctx->end_write;
- kiocb_clone(&aio->iocb, iocb, get_file(file));
- aio->iocb.ki_flags = flags;
- aio->iocb.ki_complete = backing_aio_queue_completion;
- refcount_set(&aio->ref, 2);
- ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
- backing_aio_put(aio);
- if (ret != -EIOCBQUEUED)
- backing_aio_cleanup(aio, ret);
- }
-out:
- revert_creds(old_cred);
-
- return ret;
+ scoped_with_creds(ctx->cred)
+ return do_backing_file_write_iter(file, iter, iocb, flags, ctx->end_write);
}
EXPORT_SYMBOL_GPL(backing_file_write_iter);
@@ -275,15 +277,13 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO;
- old_cred = override_creds(ctx->cred);
- ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
if (ctx->accessed)
ctx->accessed(iocb->ki_filp);
@@ -297,7 +297,6 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
size_t len, unsigned int flags,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
ssize_t ret;
if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
@@ -310,11 +309,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (ret)
return ret;
- old_cred = override_creds(ctx->cred);
- file_start_write(out);
- ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
- file_end_write(out);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred) {
+ file_start_write(out);
+ ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
+ file_end_write(out);
+ }
if (ctx->end_write)
ctx->end_write(iocb, ret);
@@ -326,7 +325,6 @@ EXPORT_SYMBOL_GPL(backing_file_splice_write);
int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
struct backing_file_ctx *ctx)
{
- const struct cred *old_cred;
struct file *user_file = vma->vm_file;
int ret;
@@ -338,9 +336,8 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file);
- old_cred = override_creds(ctx->cred);
- ret = vfs_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
+ scoped_with_creds(ctx->cred)
+ ret = vfs_mmap(vma->vm_file, vma);
if (ctx->accessed)
ctx->accessed(user_file);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 8f430ff8e445..9fcfdd6b8189 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -307,7 +307,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
befs_ino = BEFS_I(inode);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1d41ce477df5..ce6f83234b67 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -42,7 +42,7 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) {
@@ -61,7 +61,19 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
di = (struct bfs_inode *)bh->b_data + off;
- inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode);
+ /*
+ * https://martin.hinner.info/fs/bfs/bfs-structure.html explains that
+ * BFS in SCO UnixWare environment used only lower 9 bits of di->i_mode
+ * value. This means that, although bfs_write_inode() saves whole
+ * inode->i_mode bits (which include S_IFMT bits and S_IS{UID,GID,VTX}
+ * bits), middle 7 bits of di->i_mode value can be garbage when these
+ * bits were not saved by bfs_write_inode().
+ * Since we can't tell whether middle 7 bits are garbage, use only
+ * lower 12 bits (i.e. tolerate S_IS{UID,GID,VTX} bits possibly being
+ * garbage) and reconstruct S_IFMT bits for Linux environment from
+ * di->i_vtype value.
+ */
+ inode->i_mode = 0x00000FFF & le32_to_cpu(di->i_mode);
if (le32_to_cpu(di->i_vtype) == BFS_VDIR) {
inode->i_mode |= S_IFDIR;
inode->i_op = &bfs_dir_inops;
@@ -71,6 +83,11 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
inode->i_mapping->a_ops = &bfs_aops;
+ } else {
+ brelse(bh);
+ printf("Unknown vtype=%u %s:%08lx\n",
+ le32_to_cpu(di->i_vtype), inode->i_sb->s_id, ino);
+ goto error;
}
BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock);
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a839f960cd4a..d7aec5b87c2b 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -782,8 +782,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
- const struct cred *old_cred;
-
/*
* Now that we support unprivileged binfmt_misc mounts make
* sure we use the credentials that the register @file was
@@ -791,9 +789,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
* didn't matter much as only a privileged process could open
* the register file.
*/
- old_cred = override_creds(file->f_cred);
- f = open_exec(e->interpreter);
- revert_creds(old_cred);
+ scoped_with_creds(file->f_cred)
+ f = open_exec(e->interpreter);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
@@ -837,8 +834,10 @@ out:
inode_unlock(d_inode(root));
if (err) {
- if (f)
+ if (f) {
+ exe_file_allow_write_access(f);
filp_close(f, NULL);
+ }
kfree(e);
return err;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 5322ef2ae015..08cdda47509f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1850,12 +1850,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!btrfs_should_reclaim(fs_info))
return;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
- sb_end_write(fs_info->sb);
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
- }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1863,7 +1861,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return;
}
@@ -1947,7 +1944,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
/*
* Get out fast, in case we're read-only or unmounting the
* filesystem. It is OK to drop block groups from the list even
- * for the read-only case. As we did sb_start_write(),
+ * for the read-only case. As we did take the super write lock,
* "mount -o remount,ro" won't happen and read-only filesystem
* means it is forced read-only due to a fatal error. So, it
* never gets back to read-write to let us reclaim again.
@@ -2030,7 +2027,6 @@ end:
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index eba188a9e3bb..aee1fd21cdd6 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -85,8 +85,8 @@ static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u6
{
/* @cur must be inside the folio. */
ASSERT(folio_pos(folio) <= cur);
- ASSERT(cur < folio_end(folio));
- return min(range_end, folio_end(folio)) - cur;
+ ASSERT(cur < folio_next_pos(folio));
+ return umin(range_end, folio_next_pos(folio)) - cur;
}
int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 7b277934f66f..a7f20f048398 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -254,10 +254,9 @@ again:
range.extent_thresh = defrag->extent_thresh;
file_ra_state_init(ra, inode->vfs_inode.i_mapping);
- sb_start_write(fs_info->sb);
- ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
- sb_end_write(fs_info->sb);
+ scoped_guard(super_write, fs_info->sb)
+ ret = btrfs_defrag_file(inode, ra, &range,
+ defrag->transid, BTRFS_DEFRAG_BATCH);
iput(&inode->vfs_inode);
if (ret < 0)
@@ -886,7 +885,7 @@ again:
}
lock_start = folio_pos(folio);
- lock_end = folio_end(folio) - 1;
+ lock_end = folio_next_pos(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -1178,7 +1177,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
if (!folio)
break;
- if (start >= folio_end(folio) || start + len <= folio_pos(folio))
+ if (start >= folio_next_pos(folio) ||
+ start + len <= folio_pos(folio))
continue;
btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
@@ -1219,7 +1219,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
folios[i] = NULL;
goto free_folios;
}
- cur = folio_end(folios[i]);
+ cur = folio_next_pos(folios[i]);
}
for (int i = 0; i < nr_pages; i++) {
if (!folios[i])
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 23273d0e6f22..7361d5d890d2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -333,7 +333,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_end(folio), end + 1) - range_start;
+ range_len = min_t(u64, folio_next_pos(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -387,7 +387,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_end(locked_folio) ||
+ ASSERT(!(orig_start >= folio_next_pos(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
@@ -493,7 +493,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_end(folio));
+ start + len <= folio_next_pos(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
@@ -1201,7 +1201,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* finished our folio read and unlocked the folio.
*/
if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_end(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
ret = true;
@@ -1223,7 +1223,7 @@ static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
* So we return true and update @next_ret to the OE/folio boundary.
*/
if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
- u64 range_len = min(folio_end(folio),
+ u64 range_len = umin(folio_next_pos(folio),
ordered->file_offset + ordered->num_bytes) - cur;
/*
@@ -2215,7 +2215,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_end(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
folio_lock(folio);
@@ -2468,10 +2468,7 @@ static int extent_write_cache_pages(struct address_space *mapping,
&BTRFS_I(inode)->runtime_flags))
wbc->tagged_writepages = 1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
@@ -2627,7 +2624,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
continue;
}
- cur_end = min_t(u64, folio_end(folio) - 1, end);
+ cur_end = min_t(u64, folio_next_pos(folio) - 1, end);
cur_len = cur_end + 1 - cur;
ASSERT(folio_test_locked(folio));
@@ -3868,7 +3865,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 range_start = max_t(u64, eb->start, folio_pos(folio));
- u32 range_len = min_t(u64, folio_end(folio),
+ u32 range_len = min_t(u64, folio_next_pos(folio),
eb->start + eb->len) - range_start;
bio_add_folio_nofail(&bbio->bio, folio, range_len,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fa82def46e39..e7453f992e1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -89,7 +89,8 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos &&
+ folio_next_pos(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -799,7 +800,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64
u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_next_pos(folio));
const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
@@ -1254,8 +1255,8 @@ again:
* The reserved range goes beyond the current folio, shrink the reserved
* space to the folio boundary.
*/
- if (reserved_start + reserved_len > folio_end(folio)) {
- const u64 last_block = folio_end(folio);
+ if (reserved_start + reserved_len > folio_next_pos(folio)) {
+ const u64 last_block = folio_next_pos(folio);
shrink_reserved_space(inode, *data_reserved, reserved_start,
reserved_len, last_block - reserved_start,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df5f36185a0..9c6ca87b3d56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9,6 +9,7 @@
#include <linux/blk-cgroup.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
@@ -177,8 +178,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return ret;
}
ret = paths_from_inode(inum, ipath);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_put_root(local_root);
goto err;
+ }
/*
* We deliberately ignore the bit ipath might have been too small to
@@ -409,7 +412,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
continue;
}
- index = folio_end(folio) >> PAGE_SHIFT;
+ index = folio_next_index(folio);
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -2336,7 +2339,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
+ ASSERT(!(end <= folio_pos(locked_folio) ||
+ start >= folio_next_pos(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
@@ -2743,7 +2747,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_end(folio) - 1;
+ u64 page_end = folio_next_pos(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -3884,7 +3888,7 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
}
return 0;
@@ -4855,7 +4859,7 @@ again:
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = folio_end(folio);
+ zero_end = folio_next_pos(folio);
folio_zero_range(folio, zero_start - folio_pos(folio),
zero_end - zero_start);
@@ -5038,7 +5042,7 @@ again:
* not reach disk, it still affects our page caches.
*/
zero_start = max_t(u64, folio_pos(folio), start);
- zero_end = min_t(u64, folio_end(folio) - 1, end);
+ zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
} else {
zero_start = max_t(u64, block_start, start);
zero_end = min_t(u64, block_end, end);
@@ -5361,7 +5365,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct rb_node *node;
- ASSERT(inode->i_state & I_FREEING);
+ ASSERT(inode_state_read_once(inode) & I_FREEING);
truncate_inode_pages_final(&inode->i_data);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
@@ -5799,7 +5803,7 @@ struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
@@ -5823,7 +5827,7 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->vfs_inode.i_state & I_NEW))
+ if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
return inode;
path = btrfs_alloc_path();
@@ -5837,6 +5841,8 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (ret)
return ERR_PTR(ret);
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
unlock_new_inode(&inode->vfs_inode);
return inode;
}
@@ -6289,8 +6295,8 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
}
/*
- * This is a copy of file_update_time. We need this so we can return error on
- * ENOSPC for updating the inode in the case of file write and mmap writes.
+ * We need our own ->update_time so that we can return error on ENOSPC for
+ * updating the inode in the case of file write and mmap writes.
*/
static int btrfs_update_time(struct inode *inode, int flags)
{
@@ -6788,8 +6794,11 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
}
ret = btrfs_create_new_inode(trans, &new_inode_args);
- if (!ret)
+ if (!ret) {
+ if (S_ISDIR(inode->i_mode))
+ inode->i_opflags |= IOP_FASTPERM_MAY_EXEC;
d_instantiate_new(dentry, inode);
+ }
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
@@ -7479,7 +7488,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
u64 page_start = folio_pos(folio);
u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
- int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
+ int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
/*
* We have folio locked so no new ordered extent can be created on this
@@ -8708,15 +8717,13 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root,
- struct writeback_control *wbc, bool snapshot,
- bool in_reclaim_context)
+static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
+ bool snapshot, bool in_reclaim_context)
{
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
int ret = 0;
- bool full_flush = wbc->nr_to_write == LONG_MAX;
mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
@@ -8742,10 +8749,10 @@ static int start_delalloc_inodes(struct btrfs_root *root,
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
- if (full_flush) {
- work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
+ if (nr_to_write == NULL) {
+ work = btrfs_alloc_delalloc_work(tmp_inode);
if (!work) {
- iput(&inode->vfs_inode);
+ iput(tmp_inode);
ret = -ENOMEM;
goto out;
}
@@ -8753,9 +8760,11 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ ret = filemap_flush_nr(tmp_inode->i_mapping,
+ nr_to_write);
btrfs_add_delayed_iput(inode);
- if (ret || wbc->nr_to_write <= 0)
+
+ if (ret || *nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -8781,29 +8790,17 @@ out:
/*
* Start delalloc writeback for @root with the snapshot argument of
* start_delalloc_inodes() set to true. Returns -EROFS without doing any
* work when BTRFS_FS_ERROR() is true for the filesystem; otherwise returns
* the result of start_delalloc_inodes().
*/
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = LONG_MAX,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
struct btrfs_fs_info *fs_info = root->fs_info;
if (BTRFS_FS_ERROR(fs_info))
return -EROFS;
-
- return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
+ return start_delalloc_inodes(root, NULL, true, in_reclaim_context); /* NULL nr_to_write stands in for the old LONG_MAX budget */
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context)
{
- struct writeback_control wbc = {
- .nr_to_write = nr,
- .sync_mode = WB_SYNC_NONE,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
+ long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
struct btrfs_root *root;
LIST_HEAD(splice);
int ret;
@@ -8815,13 +8812,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice)) {
- /*
- * Reset nr_to_write here so we know that we're doing a full
- * flush.
- */
- if (nr == LONG_MAX)
- wbc.nr_to_write = LONG_MAX;
-
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -8830,9 +8820,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
+ ret = start_delalloc_inodes(root, nr_to_write, false,
+ in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0 || wbc.nr_to_write <= 0)
+ if (ret < 0 || nr <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
@@ -9168,6 +9159,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
+/*
+ * NOTE: in case you are adding MAY_EXEC check for directories:
+ * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
+ * elide calls here.
+ */
static int btrfs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask)
{
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8cb7d5a462ef..b138120feba3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -904,14 +904,9 @@ static noinline int btrfs_mksubvol(struct dentry *parent,
struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
int ret;
- ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (ret == -EINTR)
- return ret;
-
- dentry = lookup_one(idmap, qname, parent);
- ret = PTR_ERR(dentry);
+ dentry = start_creating_killable(idmap, parent, qname);
if (IS_ERR(dentry))
- goto out_unlock;
+ return PTR_ERR(dentry);
ret = btrfs_may_create(idmap, dir, dentry);
if (ret)
@@ -940,9 +935,7 @@ static noinline int btrfs_mksubvol(struct dentry *parent,
out_up_read:
up_read(&fs_info->subvol_sem);
out_dput:
- dput(dentry);
-out_unlock:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
+ end_creating(dentry);
return ret;
}
@@ -2417,18 +2410,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto free_subvol_name;
}
- ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (ret == -EINTR)
- goto free_subvol_name;
- dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
+ dentry = start_removing_killable(idmap, parent, &QSTR(subvol_name));
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
- goto out_unlock_dir;
- }
-
- if (d_really_is_negative(dentry)) {
- ret = -ENOENT;
- goto out_dput;
+ goto out_end_removing;
}
inode = d_inode(dentry);
@@ -2449,7 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
- goto out_dput;
+ goto out_end_removing;
/*
* Do not allow deletion if the parent dir is the same
@@ -2460,21 +2445,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
*/
ret = -EINVAL;
if (root == dest)
- goto out_dput;
+ goto out_end_removing;
ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
if (ret)
- goto out_dput;
+ goto out_end_removing;
}
/* check if subvolume may be deleted by a user */
ret = btrfs_may_delete(idmap, dir, dentry, 1);
if (ret)
- goto out_dput;
+ goto out_end_removing;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
- goto out_dput;
+ goto out_end_removing;
}
btrfs_inode_lock(BTRFS_I(inode), 0);
@@ -2483,10 +2468,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (!ret)
d_delete_notify(dir, dentry);
-out_dput:
- dput(dentry);
-out_unlock_dir:
- btrfs_inode_unlock(BTRFS_I(dir), 0);
+out_end_removing:
+ end_removing(dentry);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 60f9b000d644..17b71e1285e5 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -209,9 +209,4 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
return (found_set == start + nbits);
}
-static inline u64 folio_end(struct folio *folio)
-{
- return folio_pos(folio) + folio_size(folio);
-}
-
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2829f20d7bb5..7fedebbee558 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -359,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_end(folio));
+ ASSERT(file_offset + len <= folio_next_pos(folio));
/*
* Ordered flag indicates whether we still have
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 651b11884f82..ba20d9286a34 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
&length, &bioc, NULL, NULL);
if (ret < 0) {
+ bio_put(bio);
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
goto out;
@@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_put_bioc(bioc);
if (!rbio) {
ret = -ENOMEM;
+ bio_put(bio);
btrfs_bio_counter_dec(fs_info);
goto out;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 5ca8d4db6722..a7ba868e9372 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -186,7 +186,8 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
* unmapped page like dummy extent buffer pages.
*/
if (folio->mapping)
- ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio),
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_next_pos(folio),
"start=%llu len=%u folio_pos=%llu folio_size=%zu",
start, len, folio_pos(folio), folio_size(folio));
}
@@ -217,7 +218,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_next_pos(folio), orig_start + orig_len) - *start;
}
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c90b2d2cb08f..30f3c3b849c1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -7122,7 +7122,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
- if (inode_only != LOG_INODE_EXISTS)
+ if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2bec544d8ba3..cc8aa4a04348 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2002,14 +2002,11 @@ out:
/*
* Resolve @device_path with kern_path(LOOKUP_FOLLOW) and update the
* timestamps of what it names. Best effort: a failed lookup is silently
* ignored and nothing is reported back to the caller.
*/
static void update_dev_time(const char *device_path)
{
struct path path;
- int ret;
-
- ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
- if (ret)
- return;
- inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
- path_put(&path);
+ if (!kern_path(device_path, LOOKUP_FOLLOW, &path)) { /* zero return means the lookup succeeded */
+ vfs_utimes(&path, NULL); /* result ignored; NULL times presumably means "now" - confirm */
+ path_put(&path); /* paired with the successful kern_path() above */
+ }
}
static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
@@ -4660,12 +4657,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
- sb_end_write(fs_info->sb);
return ret;
}
@@ -8177,12 +8174,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
- sb_start_write(fs_info->sb);
+ guard(super_write)(fs_info->sb);
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
- sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8210,7 +8207,6 @@ out:
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
- sb_end_write(fs_info->sb);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 0ea0df18a8e4..d1db7fa1fe58 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
+ info->capacity = device->zone_info->zone_size;
return 0;
}
@@ -1522,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1529,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
- u64 stripe_nr, full_stripe_nr;
- u64 stripe_offset;
- int stripe_index;
- stripe_nr = div64_u64(last_alloc, map->stripe_size);
- stripe_offset = stripe_nr * map->stripe_size;
- full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
- div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
-
- zone_info[i].alloc_offset =
- full_stripe_nr * map->stripe_size;
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > i)
- zone_info[i].alloc_offset += map->stripe_size;
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == i)
- zone_info[i].alloc_offset +=
- (last_alloc - stripe_offset);
+ zone_info[i].alloc_offset += stripe_offset;
}
if (test_bit(0, active) != test_bit(i, active)) {
@@ -1574,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 stripe_nr = 0, stripe_offset = 0;
+ u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1581,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
+ if (last_alloc) {
+ u32 factor = map->num_stripes / map->sub_stripes;
+
+ stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
+ stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
+ }
+
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
@@ -1594,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
- u64 stripe_nr, full_stripe_nr;
- u64 stripe_offset;
- int stripe_index;
-
- stripe_nr = div64_u64(last_alloc, map->stripe_size);
- stripe_offset = stripe_nr * map->stripe_size;
- full_stripe_nr = div_u64(stripe_nr,
- map->num_stripes / map->sub_stripes);
- div_u64_rem(stripe_nr,
- (map->num_stripes / map->sub_stripes),
- &stripe_index);
-
- zone_info[i].alloc_offset =
- full_stripe_nr * map->stripe_size;
+ zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > (i / map->sub_stripes))
- zone_info[i].alloc_offset += map->stripe_size;
+ zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == (i / map->sub_stripes))
- zone_info[i].alloc_offset +=
- (last_alloc - stripe_offset);
+ zone_info[i].alloc_offset += stripe_offset;
}
if ((i % map->sub_stripes) == 0) {
@@ -1683,8 +1680,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
- /* Zone capacity is always zone size in emulation */
- cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@@ -1693,6 +1688,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
+ cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 6a8752f7bbed..838c0c571022 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -611,9 +611,9 @@ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
return err;
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
@@ -2732,7 +2732,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
loff_t i_size = i_size_read(inode);
/* Is the folio fully inside i_size? */
- if (folio_pos(folio) + folio_size(folio) <= i_size)
+ if (folio_next_pos(folio) <= i_size)
return __block_write_full_folio(inode, folio, get_block, wbc);
/* Is the folio fully outside i_size? (truncate in progress) */
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 3e63cfe15874..a08250d244ea 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/file.h>
+#include <linux/namei.h>
#include <linux/falloc.h>
#include <trace/events/fscache.h>
#include "internal.h"
@@ -428,11 +429,13 @@ static bool cachefiles_invalidate_cookie(struct fscache_cookie *cookie)
if (!old_tmpfile) {
struct cachefiles_volume *volume = object->volume;
struct dentry *fan = volume->fanout[(u8)cookie->key_hash];
+ struct dentry *obj;
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
- cachefiles_bury_object(volume->cache, object, fan,
- old_file->f_path.dentry,
- FSCACHE_OBJECT_INVALIDATED);
+ obj = start_removing_dentry(fan, old_file->f_path.dentry);
+ if (!IS_ERR(obj))
+ cachefiles_bury_object(volume->cache, object,
+ fan, obj,
+ FSCACHE_OBJECT_INVALIDATED);
}
fput(old_file);
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d1edb2ac3837..e5ec90dccc27 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -93,12 +93,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
_enter(",,%s", dirname);
/* search the current directory for the element name */
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
retry:
ret = cachefiles_inject_read_error();
if (ret == 0)
- subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir);
+ subdir = start_creating(&nop_mnt_idmap, dir, &QSTR(dirname));
else
subdir = ERR_PTR(ret);
trace_cachefiles_lookup(NULL, dir, subdir);
@@ -129,10 +128,12 @@ retry:
if (ret < 0)
goto mkdir_error;
ret = cachefiles_inject_write_error();
- if (ret == 0)
- subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- else
+ if (ret == 0) {
+ subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700, NULL);
+ } else {
+ end_creating(subdir);
subdir = ERR_PTR(ret);
+ }
if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
@@ -141,7 +142,7 @@ retry:
trace_cachefiles_mkdir(dir, subdir);
if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
- dput(subdir);
+ end_creating(subdir);
goto retry;
}
ASSERT(d_backing_inode(subdir));
@@ -154,7 +155,7 @@ retry:
/* Tell rmdir() it's not allowed to delete the subdir */
inode_lock(d_inode(subdir));
- inode_unlock(d_inode(dir));
+ end_creating_keep(subdir);
if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) {
pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n",
@@ -196,14 +197,11 @@ mark_error:
return ERR_PTR(-EBUSY);
mkdir_error:
- inode_unlock(d_inode(dir));
- if (!IS_ERR(subdir))
- dput(subdir);
+ end_creating(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
- inode_unlock(d_inode(dir));
ret = PTR_ERR(subdir);
pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
@@ -263,6 +261,8 @@ static int cachefiles_unlink(struct cachefiles_cache *cache,
* - File backed objects are unlinked
* - Directory backed objects are stuffed into the graveyard for userspace to
* delete
+ * On entry dir must be locked. It will be unlocked on exit.
+ * On entry there must be at least 2 refs on rep, one will be dropped on exit.
*/
int cachefiles_bury_object(struct cachefiles_cache *cache,
struct cachefiles_object *object,
@@ -278,27 +278,23 @@ int cachefiles_bury_object(struct cachefiles_cache *cache,
_enter(",'%pd','%pd'", dir, rep);
if (rep->d_parent != dir) {
- inode_unlock(d_inode(dir));
+ end_removing(rep);
_leave(" = -ESTALE");
return -ESTALE;
}
/* non-directories can just be unlinked */
if (!d_is_dir(rep)) {
- dget(rep); /* Stop the dentry being negated if it's only pinned
- * by a file struct.
- */
ret = cachefiles_unlink(cache, object, dir, rep, why);
- dput(rep);
+ end_removing(rep);
- inode_unlock(d_inode(dir));
_leave(" = %d", ret);
return ret;
}
/* directories have to be moved to the graveyard */
_debug("move stale object to graveyard");
- inode_unlock(d_inode(dir));
+ end_removing(rep);
try_again:
/* first step is to make up a grave dentry in the graveyard */
@@ -425,13 +421,12 @@ int cachefiles_delete_object(struct cachefiles_object *object,
_enter(",OBJ%x{%pD}", object->debug_id, object->file);
- /* Stop the dentry being negated if it's only pinned by a file struct. */
- dget(dentry);
-
- inode_lock_nested(d_backing_inode(fan), I_MUTEX_PARENT);
- ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
- inode_unlock(d_backing_inode(fan));
- dput(dentry);
+ dentry = start_removing_dentry(fan, dentry);
+ if (IS_ERR(dentry))
+ ret = PTR_ERR(dentry);
+ else
+ ret = cachefiles_unlink(volume->cache, object, fan, dentry, why);
+ end_removing(dentry);
return ret;
}
@@ -644,9 +639,13 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
if (!d_is_reg(dentry)) {
pr_err("%pd is not a file\n", dentry);
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
- ret = cachefiles_bury_object(volume->cache, object, fan, dentry,
- FSCACHE_OBJECT_IS_WEIRD);
+ struct dentry *de = start_removing_dentry(fan, dentry);
+ if (IS_ERR(de))
+ ret = PTR_ERR(de);
+ else
+ ret = cachefiles_bury_object(volume->cache, object,
+ fan, de,
+ FSCACHE_OBJECT_IS_WEIRD);
dput(dentry);
if (ret < 0)
return false;
@@ -679,36 +678,41 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
_enter(",%pD", object->file);
- inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
+ dentry = start_creating(&nop_mnt_idmap, fan, &QSTR(object->d_name));
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
cachefiles_trace_lookup_error);
_debug("lookup fail %ld", PTR_ERR(dentry));
- goto out_unlock;
+ goto out;
}
- if (!d_is_negative(dentry)) {
+ /*
+ * This loop will only execute more than once if some other thread
+ * races to create the object we are trying to create.
+ */
+ while (!d_is_negative(dentry)) {
ret = cachefiles_unlink(volume->cache, object, fan, dentry,
FSCACHE_OBJECT_IS_STALE);
if (ret < 0)
- goto out_dput;
+ goto out_end;
+
+ end_creating(dentry);
- dput(dentry);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
+ dentry = start_creating(&nop_mnt_idmap, fan,
+ &QSTR(object->d_name));
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
trace_cachefiles_vfs_error(object, d_inode(fan), PTR_ERR(dentry),
cachefiles_trace_lookup_error);
_debug("lookup fail %ld", PTR_ERR(dentry));
- goto out_unlock;
+ goto out;
}
}
@@ -729,10 +733,9 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
success = true;
}
-out_dput:
- dput(dentry);
-out_unlock:
- inode_unlock(d_inode(fan));
+out_end:
+ end_creating(dentry);
+out:
_leave(" = %u", success);
return success;
}
@@ -748,26 +751,20 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
struct dentry *victim;
int ret = -ENOENT;
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+ victim = start_removing(&nop_mnt_idmap, dir, &QSTR(filename));
- victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir);
if (IS_ERR(victim))
goto lookup_error;
- if (d_is_negative(victim))
- goto lookup_put;
if (d_inode(victim)->i_flags & S_KERNEL_FILE)
goto lookup_busy;
return victim;
lookup_busy:
ret = -EBUSY;
-lookup_put:
- inode_unlock(d_inode(dir));
- dput(victim);
+ end_removing(victim);
return ERR_PTR(ret);
lookup_error:
- inode_unlock(d_inode(dir));
ret = PTR_ERR(victim);
if (ret == -ENOENT)
return ERR_PTR(-ESTALE); /* Probably got retired by the netfs */
@@ -815,18 +812,17 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
ret = cachefiles_bury_object(cache, NULL, dir, victim,
FSCACHE_OBJECT_WAS_CULLED);
+ dput(victim);
if (ret < 0)
goto error;
fscache_count_culled();
- dput(victim);
_leave(" = 0");
return 0;
error_unlock:
- inode_unlock(d_inode(dir));
+ end_removing(victim);
error:
- dput(victim);
if (ret == -ENOENT)
return -ESTALE; /* Probably got retired by the netfs */
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index 781aac4ef274..90ba926f488e 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/namei.h>
#include "internal.h"
#include <trace/events/fscache.h>
@@ -58,9 +59,11 @@ retry:
if (ret < 0) {
if (ret != -ESTALE)
goto error_dir;
- inode_lock_nested(d_inode(cache->store), I_MUTEX_PARENT);
- cachefiles_bury_object(cache, NULL, cache->store, vdentry,
- FSCACHE_VOLUME_IS_WEIRD);
+ vdentry = start_removing_dentry(cache->store, vdentry);
+ if (!IS_ERR(vdentry))
+ cachefiles_bury_object(cache, NULL, cache->store,
+ vdentry,
+ FSCACHE_VOLUME_IS_WEIRD);
cachefiles_put_directory(volume->dentry);
cond_resched();
goto retry;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 322ed268f14a..63b75d214210 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1045,11 +1045,7 @@ void ceph_init_writeback_ctl(struct address_space *mapping,
ceph_wbc->index = ceph_wbc->start_index;
ceph_wbc->end = -1;
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
- } else {
- ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
- }
+ ceph_wbc->tag = wbc_to_tag(wbc);
ceph_wbc->op_idx = -1;
ceph_wbc->num_ops = 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 930fbd54d2c8..f678bab189d8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -26,7 +26,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
return;
/* Only new inodes! */
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return;
WARN_ON_ONCE(ci->netfs.cache);
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 7026e794813c..928746b92512 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -329,7 +329,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
out:
kfree(cryptbuf);
if (dir != parent) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
@@ -438,7 +438,7 @@ out:
fscrypt_fname_free_buffer(&_tname);
out_inode:
if (dir != fname->dir) {
- if ((dir->i_state & I_NEW))
+ if ((inode_state_read_once(dir) & I_NEW))
discard_new_inode(dir);
else
iput(dir);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 99b30f784ee2..983390069f73 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -740,7 +740,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
vino.ino, ceph_ino(dir), dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
/*
* If it's not I_NEW, then someone created this before
* we got here. Assume the server is aware of it at
@@ -901,7 +901,7 @@ retry:
new_inode = NULL;
goto out_req;
}
- WARN_ON_ONCE(!(new_inode->i_state & I_NEW));
+ WARN_ON_ONCE(!(inode_state_read_once(new_inode) & I_NEW));
spin_lock(&dentry->d_lock);
di->flags |= CEPH_DENTRY_ASYNC_CREATE;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a6e260d9e420..37d3a2477c17 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -132,7 +132,7 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
goto out_err;
}
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
inode->i_mode = *mode;
err = ceph_security_init_secctx(dentry, *mode, as_ctx);
@@ -201,7 +201,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
doutc(cl, "on %llx=%llx.%llx got %p new %d\n",
ceph_present_inode(inode), ceph_vinop(inode), inode,
- !!(inode->i_state & I_NEW));
+ !!(inode_state_read_once(inode) & I_NEW));
return inode;
}
@@ -228,7 +228,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
goto err;
}
- if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
+ if (!(inode_state_read_once(inode) & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
goto err;
@@ -261,7 +261,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
}
#endif
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
@@ -270,7 +270,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
return inode;
err:
- if ((inode->i_state & I_NEW))
+ if ((inode_state_read_once(inode) & I_NEW))
discard_new_inode(inode);
else
iput(inode);
@@ -744,7 +744,7 @@ void ceph_evict_inode(struct inode *inode)
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
@@ -1013,7 +1013,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
le64_to_cpu(info->version), ci->i_version);
/* Once I_NEW is cleared, we can't change type or dev numbers */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_mode = mode;
} else {
if (inode_wrong_type(inode, mode)) {
@@ -1090,7 +1090,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
#ifdef CONFIG_FS_ENCRYPTION
if (iinfo->fscrypt_auth_len &&
- ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ ((inode_state_read_once(inode) & I_NEW) || (ci->fscrypt_auth_len == 0))) {
kfree(ci->fscrypt_auth);
ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
ci->fscrypt_auth = iinfo->fscrypt_auth;
@@ -1692,13 +1692,13 @@ retry_lookup:
pr_err_client(cl, "badness %p %llx.%llx\n", in,
ceph_vinop(in));
req->r_target_inode = NULL;
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
discard_new_inode(in);
else
iput(in);
goto done;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
}
@@ -1898,11 +1898,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
pr_err_client(cl, "inode badness on %p got %d\n", in,
rc);
err = rc;
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
- } else if (in->i_state & I_NEW) {
+ } else if (inode_state_read_once(in) & I_NEW) {
unlock_new_inode(in);
}
@@ -2114,7 +2114,7 @@ retry_lookup:
pr_err_client(cl, "badness on %p %llx.%llx\n", in,
ceph_vinop(in));
if (d_really_is_negative(dn)) {
- if (in->i_state & I_NEW) {
+ if (inode_state_read_once(in) & I_NEW) {
ihold(in);
discard_new_inode(in);
}
@@ -2124,7 +2124,7 @@ retry_lookup:
err = ret;
goto next_item;
}
- if (in->i_state & I_NEW)
+ if (inode_state_read_once(in) & I_NEW)
unlock_new_inode(in);
if (d_really_is_negative(dn)) {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ad0cf177e75a..f6bf24b5c683 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1149,7 +1149,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
const char *path = fsc->mount_options->server_path ?
fsc->mount_options->server_path + 1 : "";
- err = __ceph_open_session(fsc->client, started);
+ err = __ceph_open_session(fsc->client);
if (err < 0)
goto out;
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 62a3d2565c26..70bb0579b40c 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -70,7 +70,7 @@ retry:
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cii = ITOC(inode);
/* we still need to set i_ino for things like stat(2) */
inode->i_ino = hash;
@@ -148,7 +148,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
/* we should never see newly created inodes because we intentionally
* fail in the initialization callback */
- BUG_ON(inode->i_state & I_NEW);
+ BUG_ON(inode_state_read_once(inode) & I_NEW);
return inode;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 5c1c381ee380..fe4099e0530b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1036,7 +1036,7 @@ static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
static bool coredump_write(struct core_name *cn,
struct coredump_params *cprm,
- struct linux_binfmt *binfmt)
+ const struct linux_binfmt *binfmt)
{
if (dump_interrupted())
@@ -1086,119 +1086,119 @@ static inline bool coredump_skip(const struct coredump_params *cprm,
return false;
}
-void vfs_coredump(const kernel_siginfo_t *siginfo)
+static void do_coredump(struct core_name *cn, struct coredump_params *cprm,
+ size_t **argv, int *argc, const struct linux_binfmt *binfmt)
{
- struct cred *cred __free(put_cred) = NULL;
- size_t *argv __free(kfree) = NULL;
- struct core_state core_state;
- struct core_name cn;
- struct mm_struct *mm = current->mm;
- struct linux_binfmt *binfmt = mm->binfmt;
- const struct cred *old_cred;
- int argc = 0;
- struct coredump_params cprm = {
- .siginfo = siginfo,
- .limit = rlimit(RLIMIT_CORE),
- /*
- * We must use the same mm->flags while dumping core to avoid
- * inconsistency of bit flags, since this flag is not protected
- * by any locks.
- *
- * Note that we only care about MMF_DUMP* flags.
- */
- .mm_flags = __mm_flags_get_dumpable(mm),
- .vma_meta = NULL,
- .cpu = raw_smp_processor_id(),
- };
-
- audit_core_dumps(siginfo->si_signo);
-
- if (coredump_skip(&cprm, binfmt))
- return;
-
- cred = prepare_creds();
- if (!cred)
- return;
- /*
- * We cannot trust fsuid as being the "true" uid of the process
- * nor do we know its entire history. We only know it was tainted
- * so we dump it as root in mode 2, and only into a controlled
- * environment (pipe handler or fully qualified path).
- */
- if (coredump_force_suid_safe(&cprm))
- cred->fsuid = GLOBAL_ROOT_UID;
-
- if (coredump_wait(siginfo->si_signo, &core_state) < 0)
- return;
-
- old_cred = override_creds(cred);
-
- if (!coredump_parse(&cn, &cprm, &argv, &argc)) {
+ if (!coredump_parse(cn, cprm, argv, argc)) {
coredump_report_failure("format_corename failed, aborting core");
- goto close_fail;
+ return;
}
- switch (cn.core_type) {
+ switch (cn->core_type) {
case COREDUMP_FILE:
- if (!coredump_file(&cn, &cprm, binfmt))
- goto close_fail;
+ if (!coredump_file(cn, cprm, binfmt))
+ return;
break;
case COREDUMP_PIPE:
- if (!coredump_pipe(&cn, &cprm, argv, argc))
- goto close_fail;
+ if (!coredump_pipe(cn, cprm, *argv, *argc))
+ return;
break;
case COREDUMP_SOCK_REQ:
fallthrough;
case COREDUMP_SOCK:
- if (!coredump_socket(&cn, &cprm))
- goto close_fail;
+ if (!coredump_socket(cn, cprm))
+ return;
break;
default:
WARN_ON_ONCE(true);
- goto close_fail;
+ return;
}
/* Don't even generate the coredump. */
- if (cn.mask & COREDUMP_REJECT)
- goto close_fail;
+ if (cn->mask & COREDUMP_REJECT)
+ return;
/* get us an unshared descriptor table; almost always a no-op */
/* The cell spufs coredump code reads the file descriptor tables */
if (unshare_files())
- goto close_fail;
+ return;
- if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt))
- goto close_fail;
+ if ((cn->mask & COREDUMP_KERNEL) && !coredump_write(cn, cprm, binfmt))
+ return;
- coredump_sock_shutdown(cprm.file);
+ coredump_sock_shutdown(cprm->file);
/* Let the parent know that a coredump was generated. */
- if (cn.mask & COREDUMP_USERSPACE)
- cn.core_dumped = true;
+ if (cn->mask & COREDUMP_USERSPACE)
+ cn->core_dumped = true;
/*
* When core_pipe_limit is set we wait for the coredump server
* or usermodehelper to finish before exiting so it can e.g.,
* inspect /proc/<pid>.
*/
- if (cn.mask & COREDUMP_WAIT) {
- switch (cn.core_type) {
+ if (cn->mask & COREDUMP_WAIT) {
+ switch (cn->core_type) {
case COREDUMP_PIPE:
- wait_for_dump_helpers(cprm.file);
+ wait_for_dump_helpers(cprm->file);
break;
case COREDUMP_SOCK_REQ:
fallthrough;
case COREDUMP_SOCK:
- coredump_sock_wait(cprm.file);
+ coredump_sock_wait(cprm->file);
break;
default:
break;
}
}
+}
+
+void vfs_coredump(const kernel_siginfo_t *siginfo)
+{
+ size_t *argv __free(kfree) = NULL;
+ struct core_state core_state;
+ struct core_name cn;
+ const struct mm_struct *mm = current->mm;
+ const struct linux_binfmt *binfmt = mm->binfmt;
+ int argc = 0;
+ struct coredump_params cprm = {
+ .siginfo = siginfo,
+ .limit = rlimit(RLIMIT_CORE),
+ /*
+ * We must use the same mm->flags while dumping core to avoid
+ * inconsistency of bit flags, since this flag is not protected
+ * by any locks.
+ *
+ * Note that we only care about MMF_DUMP* flags.
+ */
+ .mm_flags = __mm_flags_get_dumpable(mm),
+ .vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
+ };
+
+ audit_core_dumps(siginfo->si_signo);
+
+ if (coredump_skip(&cprm, binfmt))
+ return;
+
+ CLASS(prepare_creds, cred)();
+ if (!cred)
+ return;
+ /*
+ * We cannot trust fsuid as being the "true" uid of the process
+ * nor do we know its entire history. We only know it was tainted
+ * so we dump it as root in mode 2, and only into a controlled
+ * environment (pipe handler or fully qualified path).
+ */
+ if (coredump_force_suid_safe(&cprm))
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ if (coredump_wait(siginfo->si_signo, &core_state) < 0)
+ return;
-close_fail:
+ scoped_with_creds(cred)
+ do_coredump(&cn, &cprm, &argv, &argc, binfmt);
coredump_cleanup(&cn, &cprm);
- revert_creds(old_cred);
return;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index ca54bf24b719..e54ebe402df7 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -95,7 +95,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode = iget_locked(sb, cramino(cramfs_inode, offset));
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
switch (cramfs_inode->mode & S_IFMT) {
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 3adbd7167055..5e939ea3ac28 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -945,7 +945,7 @@ static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk)
list_for_each_entry(ci, &mk->mk_decrypted_inodes, ci_master_key_link) {
inode = ci->ci_inode;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 4bd3918f50e3..40fa05688d3a 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -834,7 +834,7 @@ int fscrypt_drop_inode(struct inode *inode)
* userspace is still using the files, inodes can be dirtied between
* then and now. We mustn't lose any writes, so skip dirty inodes here.
*/
- if (inode->i_state & I_DIRTY_ALL)
+ if (inode_state_read(inode) & I_DIRTY_ALL)
return 0;
/*
diff --git a/fs/dax.c b/fs/dax.c
index 516f995a988c..38fae11ee419 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (ret < 0)
return ret;
- ret = iomap_iter_advance(iter, &length);
+ ret = iomap_iter_advance(iter, length);
if (ret)
return ret;
- } while (length > 0);
+ } while ((length = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
@@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
done = iov_iter_zero(min(length, end - pos), iter);
- return iomap_iter_advance(iomi, &done);
+ return iomap_iter_advance(iomi, done);
}
}
@@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- length = xfer;
- ret = iomap_iter_advance(iomi, &length);
+ ret = iomap_iter_advance(iomi, xfer);
if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
+ length = iomap_length(iomi);
}
dax_read_unlock(id);
@@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR)) {
- u64 length = PAGE_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (!(ret & VM_FAULT_ERROR))
+ iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
}
if (iomap_errp)
@@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK) {
- u64 length = PMD_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (ret != VM_FAULT_FALLBACK)
+ iter.status = iomap_iter_advance(&iter, PMD_SIZE);
}
unlock_entry:
@@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
- u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
dax_read_unlock(id);
advance:
- dest_len = len;
- ret = iomap_iter_advance(it_src, &len);
+ ret = iomap_iter_advance(it_src, len);
if (!ret)
- ret = iomap_iter_advance(it_dest, &dest_len);
+ ret = iomap_iter_advance(it_dest, len);
return ret;
out_unlock:
diff --git a/fs/dcache.c b/fs/dcache.c
index 035cccbc9276..9143fd502def 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -86,7 +86,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
-static struct kmem_cache *dentry_cache __ro_after_init;
+static struct kmem_cache *__dentry_cache __ro_after_init;
+#define dentry_cache runtime_const_ptr(__dentry_cache)
const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
@@ -794,7 +795,7 @@ void d_mark_dontcache(struct inode *inode)
de->d_flags |= DCACHE_DONTCACHE;
spin_unlock(&de->d_lock);
}
- inode->i_state |= I_DONTCACHE;
+ inode_state_set(inode, I_DONTCACHE);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);
@@ -1073,7 +1074,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
spin_lock(&inode->i_lock);
// ->i_dentry and ->i_rcu are colocated, but the latter won't be
// used without having I_FREEING set, which means no aliases left
- if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
+ if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) {
if (S_ISDIR(inode->i_mode)) {
de = hlist_entry(l->first, struct dentry, d_u.d_alias);
} else {
@@ -1980,14 +1981,8 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW & ~I_CREATING;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -2306,11 +2301,20 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
continue;
- if (d_unhashed(dentry))
- continue;
if (dentry->d_name.hash_len != hashlen)
continue;
- if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0))
+ continue;
+ /*
+ * Check for the dentry being unhashed.
+ *
+ * As tempting as it is, we *can't* skip it because of a race window
+ * between us finding the dentry before it gets unhashed and loading
+ * the sequence counter after unhashing is finished.
+ *
+ * We can at least predict on it.
+ */
+ if (unlikely(d_unhashed(dentry)))
continue;
*seqp = seq;
return dentry;
@@ -3222,9 +3226,10 @@ static void __init dcache_init(void)
* but it is probably not worth it because of the cache nature
* of the dcache.
*/
- dentry_cache = KMEM_CACHE_USERCOPY(dentry,
+ __dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_shortname.string);
+ runtime_const_init(ptr, __dentry_cache);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 661a99a7dfbe..532bd7c46baf 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -403,7 +403,7 @@ static struct dentry *debugfs_start_creating(const char *name,
return dentry;
}
-static struct dentry *failed_creating(struct dentry *dentry)
+static struct dentry *debugfs_failed_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
@@ -411,7 +411,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
return ERR_PTR(-ENOMEM);
}
-static struct dentry *end_creating(struct dentry *dentry)
+static struct dentry *debugfs_end_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent));
return dentry;
@@ -435,7 +435,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
+ debugfs_failed_creating(dentry);
return ERR_PTR(-EPERM);
}
@@ -443,7 +443,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create file '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = mode;
@@ -458,7 +458,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
@@ -585,7 +585,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
+ debugfs_failed_creating(dentry);
return ERR_PTR(-EPERM);
}
@@ -593,7 +593,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create directory '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
@@ -605,7 +605,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
d_instantiate(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
@@ -632,7 +632,7 @@ struct dentry *debugfs_create_automount(const char *name,
return dentry;
if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
- failed_creating(dentry);
+ debugfs_failed_creating(dentry);
return ERR_PTR(-EPERM);
}
@@ -640,7 +640,7 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode)) {
pr_err("out of free dentries, can not create automount '%s'\n",
name);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
make_empty_dir_inode(inode);
@@ -652,7 +652,7 @@ struct dentry *debugfs_create_automount(const char *name,
d_instantiate(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);
@@ -699,13 +699,13 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
pr_err("out of free dentries, can not create symlink '%s'\n",
name);
kfree(link);
- return failed_creating(dentry);
+ return debugfs_failed_creating(dentry);
}
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_op = &debugfs_symlink_inode_operations;
inode->i_link = link;
d_instantiate(dentry, inode);
- return end_creating(dentry);
+ return debugfs_end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
@@ -842,7 +842,8 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
int error = 0;
const char *new_name;
struct name_snapshot old_name;
- struct dentry *parent, *target;
+ struct dentry *target;
+ struct renamedata rd = {};
struct inode *dir;
va_list ap;
@@ -855,36 +856,31 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
if (!new_name)
return -ENOMEM;
- parent = dget_parent(dentry);
- dir = d_inode(parent);
- inode_lock(dir);
+ rd.old_parent = dget_parent(dentry);
+ rd.new_parent = rd.old_parent;
+ rd.flags = RENAME_NOREPLACE;
+ target = lookup_noperm_unlocked(&QSTR(new_name), rd.new_parent);
+ if (IS_ERR(target))
+ return PTR_ERR(target);
- take_dentry_name_snapshot(&old_name, dentry);
-
- if (WARN_ON_ONCE(dentry->d_parent != parent)) {
- error = -EINVAL;
- goto out;
- }
- if (strcmp(old_name.name.name, new_name) == 0)
- goto out;
- target = lookup_noperm(&QSTR(new_name), parent);
- if (IS_ERR(target)) {
- error = PTR_ERR(target);
- goto out;
- }
- if (d_really_is_positive(target)) {
- dput(target);
- error = -EINVAL;
+ error = start_renaming_two_dentries(&rd, dentry, target);
+ if (error) {
+ if (error == -EEXIST && target == dentry)
+ /* it isn't an error to rename a thing to itself */
+ error = 0;
goto out;
}
- simple_rename_timestamp(dir, dentry, dir, target);
- d_move(dentry, target);
- dput(target);
+
+ dir = d_inode(rd.old_parent);
+ take_dentry_name_snapshot(&old_name, dentry);
+ simple_rename_timestamp(dir, dentry, dir, rd.new_dentry);
+ d_move(dentry, rd.new_dentry);
fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry);
-out:
release_dentry_name_snapshot(&old_name);
- inode_unlock(dir);
- dput(parent);
+ end_renaming(&rd);
+out:
+ dput(rd.old_parent);
+ dput(target);
kfree_const(new_name);
return error;
}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 019a8b4eaaf9..49f56a598ecb 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -28,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
* inodes without pages but we deliberately won't in case
* we need to reschedule to avoid softlockups.
*/
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
(mapping_empty(inode->i_mapping) && !need_resched())) {
spin_unlock(&inode->i_lock);
continue;
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 1bdeaa6d5790..c2f4fb41b4e6 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -4,7 +4,7 @@ config ECRYPT_FS
depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
select CRYPTO_ECB
select CRYPTO_CBC
- select CRYPTO_MD5
+ select CRYPTO_LIB_MD5
help
Encrypted filesystem that operates on the VFS layer. See
<file:Documentation/filesystems/ecryptfs.rst> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 69536cacdea8..260f8a4938b0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -9,7 +9,6 @@
* Michael C. Thompson <mcthomps@us.ibm.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -48,32 +47,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
-/**
- * ecryptfs_calculate_md5 - calculates the md5 of @src
- * @dst: Pointer to 16 bytes of allocated memory
- * @crypt_stat: Pointer to crypt_stat struct for the current inode
- * @src: Data to be md5'd
- * @len: Length of @src
- *
- * Uses the allocated crypto context that crypt_stat references to
- * generate the MD5 sum of the contents of src.
- */
-static int ecryptfs_calculate_md5(char *dst,
- struct ecryptfs_crypt_stat *crypt_stat,
- char *src, int len)
-{
- int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst);
-
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
-out:
- return rc;
-}
-
static int ecryptfs_crypto_api_algify_cipher_name(char **algified_name,
char *cipher_name,
char *chaining_modifier)
@@ -104,13 +77,10 @@ out:
*
* Generate the initialization vector from the given root IV and page
* offset.
- *
- * Returns zero on success; non-zero on error.
*/
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset)
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
char src[ECRYPTFS_MAX_IV_BYTES + 16];
@@ -129,20 +99,12 @@ int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
ecryptfs_printk(KERN_DEBUG, "source:\n");
ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
}
- rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
- (crypt_stat->iv_bytes + 16));
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating IV for a page\n");
- goto out;
- }
+ md5(src, crypt_stat->iv_bytes + 16, dst);
memcpy(iv, dst, crypt_stat->iv_bytes);
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
}
-out:
- return rc;
}
/**
@@ -151,29 +113,14 @@ out:
*
* Initialize the crypt_stat structure.
*/
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
- struct crypto_shash *tfm;
- int rc;
-
- tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
- if (IS_ERR(tfm)) {
- rc = PTR_ERR(tfm);
- ecryptfs_printk(KERN_ERR, "Error attempting to "
- "allocate crypto context; rc = [%d]\n",
- rc);
- return rc;
- }
-
memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
INIT_LIST_HEAD(&crypt_stat->keysig_list);
mutex_init(&crypt_stat->keysig_list_mutex);
mutex_init(&crypt_stat->cs_mutex);
mutex_init(&crypt_stat->cs_tfm_mutex);
- crypt_stat->hash_tfm = tfm;
crypt_stat->flags |= ECRYPTFS_STRUCT_INITIALIZED;
-
- return 0;
}
/**
@@ -187,7 +134,6 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
crypto_free_skcipher(crypt_stat->tfm);
- crypto_free_shash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
@@ -361,14 +307,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
int rc;
extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
- rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
- (extent_base + extent_offset));
- if (rc) {
- ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
- "extent [0x%.16llx]; rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- goto out;
- }
+ ecryptfs_derive_iv(extent_iv, crypt_stat, extent_base + extent_offset);
sg_init_table(&src_sg, 1);
sg_init_table(&dst_sg, 1);
@@ -609,31 +548,20 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
*/
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
{
- int rc = 0;
char dst[MD5_DIGEST_SIZE];
BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
BUG_ON(crypt_stat->iv_bytes <= 0);
if (!(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
- rc = -EINVAL;
ecryptfs_printk(KERN_WARNING, "Session key not valid; "
"cannot generate root IV\n");
- goto out;
- }
- rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
- crypt_stat->key_size);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
- "MD5 while generating root IV\n");
- goto out;
- }
- memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
-out:
- if (rc) {
memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
crypt_stat->flags |= ECRYPTFS_SECURITY_WARNING;
+ return -EINVAL;
}
- return rc;
+ md5(crypt_stat->key, crypt_stat->key_size, dst);
+ memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+ return 0;
}
static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 9e6ab0b41337..62a2ea7f59ed 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -14,6 +14,7 @@
#ifndef ECRYPTFS_KERNEL_H
#define ECRYPTFS_KERNEL_H
+#include <crypto/md5.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
@@ -137,8 +138,6 @@ ecryptfs_get_key_payload_data(struct key *key)
+ MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
#define ECRYPTFS_DEFAULT_CIPHER "aes"
#define ECRYPTFS_DEFAULT_KEY_BYTES 16
-#define ECRYPTFS_DEFAULT_HASH "md5"
-#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -163,8 +162,6 @@ ecryptfs_get_key_payload_data(struct key *key)
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
-#define MD5_DIGEST_SIZE 16
-#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
@@ -237,8 +234,6 @@ struct ecryptfs_crypt_stat {
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct crypto_skcipher *tfm;
- struct crypto_shash *hash_tfm; /* Crypto context for generating
- * the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -558,7 +553,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
int sg_size);
int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_rotate_iv(unsigned char *iv);
-int ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
@@ -693,8 +688,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
char *data, size_t max_packet_size);
int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
-int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
- loff_t offset);
+void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t offset);
extern const struct xattr_handler * const ecryptfs_xattr_handlers[];
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ed1394da8d6b..3978248247dc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -24,18 +24,26 @@
#include <linux/unaligned.h>
#include "ecryptfs_kernel.h"
-static int lock_parent(struct dentry *dentry,
- struct dentry **lower_dentry,
- struct inode **lower_dir)
+static struct dentry *ecryptfs_start_creating_dentry(struct dentry *dentry)
{
- struct dentry *lower_dir_dentry;
+ struct dentry *parent = dget_parent(dentry);
+ struct dentry *ret;
- lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
- *lower_dir = d_inode(lower_dir_dentry);
- *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ ret = start_creating_dentry(ecryptfs_dentry_to_lower(parent),
+ ecryptfs_dentry_to_lower(dentry));
+ dput(parent);
+ return ret;
+}
- inode_lock_nested(*lower_dir, I_MUTEX_PARENT);
- return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL;
+static struct dentry *ecryptfs_start_removing_dentry(struct dentry *dentry)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct dentry *ret;
+
+ ret = start_removing_dentry(ecryptfs_dentry_to_lower(parent),
+ ecryptfs_dentry_to_lower(dentry));
+ dput(parent);
+ return ret;
}
static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
@@ -95,7 +103,7 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
iput(lower_inode);
return ERR_PTR(-EACCES);
}
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
iput(lower_inode);
return inode;
@@ -106,7 +114,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode,
{
struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
- if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+ if (!IS_ERR(inode) && (inode_state_read_once(inode) & I_NEW))
unlock_new_inode(inode);
return inode;
@@ -141,15 +149,12 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *lower_dir;
int rc;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- dget(lower_dentry); // don't even try to make the lower negative
- if (!rc) {
- if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry,
- NULL);
- }
+ lower_dentry = ecryptfs_start_removing_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+
+ lower_dir = lower_dentry->d_parent->d_inode;
+ rc = vfs_unlink(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -158,8 +163,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
out_unlock:
- dput(lower_dentry);
- inode_unlock(lower_dir);
+ end_removing(lower_dentry);
if (!rc)
d_drop(dentry);
return rc;
@@ -186,10 +190,11 @@ ecryptfs_do_create(struct inode *directory_inode,
struct inode *lower_dir;
struct inode *inode;
- rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_create(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode, true);
+ lower_dentry = ecryptfs_start_creating_dentry(ecryptfs_dentry);
+ if (IS_ERR(lower_dentry))
+ return ERR_CAST(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+ rc = vfs_create(&nop_mnt_idmap, lower_dentry, mode, NULL);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
@@ -205,7 +210,7 @@ ecryptfs_do_create(struct inode *directory_inode,
fsstack_copy_attr_times(directory_inode, lower_dir);
fsstack_copy_inode_size(directory_inode, lower_dir);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
return inode;
}
@@ -364,7 +369,7 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
}
}
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
return d_splice_alias(inode, dentry);
}
@@ -433,10 +438,12 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
file_size_save = i_size_read(d_inode(old_dentry));
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
- rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir);
- if (!rc)
- rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
- lower_new_dentry, NULL);
+ lower_new_dentry = ecryptfs_start_creating_dentry(new_dentry);
+ if (IS_ERR(lower_new_dentry))
+ return PTR_ERR(lower_new_dentry);
+ lower_dir = lower_new_dentry->d_parent->d_inode;
+ rc = vfs_link(lower_old_dentry, &nop_mnt_idmap, lower_dir,
+ lower_new_dentry, NULL);
if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -448,7 +455,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
i_size_write(d_inode(new_dentry), file_size_save);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_new_dentry);
return rc;
}
@@ -468,9 +475,11 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
size_t encoded_symlen;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (rc)
- goto out_lock;
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
mount_crypt_stat = &ecryptfs_superblock_to_private(
dir->i_sb)->mount_crypt_stat;
rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -480,7 +489,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
if (rc)
goto out_lock;
rc = vfs_symlink(&nop_mnt_idmap, lower_dir, lower_dentry,
- encoded_symname);
+ encoded_symname, NULL);
kfree(encoded_symname);
if (rc || d_really_is_negative(lower_dentry))
goto out_lock;
@@ -490,7 +499,7 @@ static int ecryptfs_symlink(struct mnt_idmap *idmap,
fsstack_copy_attr_times(dir, lower_dir);
fsstack_copy_inode_size(dir, lower_dir);
out_lock:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -501,14 +510,16 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
{
int rc;
struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
struct inode *lower_dir;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (rc)
- goto out;
-
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return lower_dentry;
+ lower_dir_dentry = dget(lower_dentry->d_parent);
+ lower_dir = lower_dir_dentry->d_inode;
lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode);
+ lower_dentry, mode, NULL);
rc = PTR_ERR(lower_dentry);
if (IS_ERR(lower_dentry))
goto out;
@@ -522,7 +533,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
fsstack_copy_inode_size(dir, lower_dir);
set_nlink(dir, lower_dir->i_nlink);
out:
- inode_unlock(lower_dir);
+ end_creating(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return ERR_PTR(rc);
@@ -534,21 +545,18 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *lower_dir;
int rc;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- dget(lower_dentry); // don't even try to make the lower negative
- if (!rc) {
- if (d_unhashed(lower_dentry))
- rc = -EINVAL;
- else
- rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry);
- }
+ lower_dentry = ecryptfs_start_removing_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
+ rc = vfs_rmdir(&nop_mnt_idmap, lower_dir, lower_dentry, NULL);
if (!rc) {
clear_nlink(d_inode(dentry));
fsstack_copy_attr_times(dir, lower_dir);
set_nlink(dir, lower_dir->i_nlink);
}
- dput(lower_dentry);
- inode_unlock(lower_dir);
+ end_removing(lower_dentry);
if (!rc)
d_drop(dentry);
return rc;
@@ -562,10 +570,12 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *lower_dentry;
struct inode *lower_dir;
- rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_mknod(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode, dev);
+ lower_dentry = ecryptfs_start_creating_dentry(dentry);
+ if (IS_ERR(lower_dentry))
+ return PTR_ERR(lower_dentry);
+ lower_dir = lower_dentry->d_parent->d_inode;
+
+ rc = vfs_mknod(&nop_mnt_idmap, lower_dir, lower_dentry, mode, dev, NULL);
if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
@@ -574,7 +584,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
fsstack_copy_attr_times(dir, lower_dir);
fsstack_copy_inode_size(dir, lower_dir);
out:
- inode_unlock(lower_dir);
+ end_removing(lower_dentry);
if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
@@ -590,7 +600,6 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
- struct dentry *trap;
struct inode *target_inode;
struct renamedata rd = {};
@@ -605,31 +614,13 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
target_inode = d_inode(new_dentry);
- trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
- if (IS_ERR(trap))
- return PTR_ERR(trap);
- dget(lower_new_dentry);
- rc = -EINVAL;
- if (lower_old_dentry->d_parent != lower_old_dir_dentry)
- goto out_lock;
- if (lower_new_dentry->d_parent != lower_new_dir_dentry)
- goto out_lock;
- if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
- goto out_lock;
- /* source should not be ancestor of target */
- if (trap == lower_old_dentry)
- goto out_lock;
- /* target should not be ancestor of source */
- if (trap == lower_new_dentry) {
- rc = -ENOTEMPTY;
- goto out_lock;
- }
+ rd.mnt_idmap = &nop_mnt_idmap;
+ rd.old_parent = lower_old_dir_dentry;
+ rd.new_parent = lower_new_dir_dentry;
+ rc = start_renaming_two_dentries(&rd, lower_old_dentry, lower_new_dentry);
+ if (rc)
+ return rc;
- rd.mnt_idmap = &nop_mnt_idmap;
- rd.old_parent = lower_old_dir_dentry;
- rd.old_dentry = lower_old_dentry;
- rd.new_parent = lower_new_dir_dentry;
- rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
if (rc)
goto out_lock;
@@ -640,8 +631,7 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new_dir != old_dir)
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
- dput(lower_new_dentry);
- unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ end_renaming(&rd);
return rc;
}
@@ -903,11 +893,8 @@ static int ecryptfs_setattr(struct mnt_idmap *idmap,
struct ecryptfs_crypt_stat *crypt_stat;
crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
- if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) {
- rc = ecryptfs_init_crypt_stat(crypt_stat);
- if (rc)
- return rc;
- }
+ if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ ecryptfs_init_crypt_stat(crypt_stat);
inode = d_inode(dentry);
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7f9f68c00ef6..bbf8603242fa 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -11,7 +11,6 @@
* Trevor S. Highland <trevor.highland@gmail.com>
*/
-#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/string.h>
#include <linux/pagemap.h>
@@ -601,10 +600,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct crypto_skcipher *skcipher_tfm;
struct skcipher_request *skcipher_req;
char iv[ECRYPTFS_MAX_IV_BYTES];
- char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- struct crypto_shash *hash_tfm;
- struct shash_desc *hash_desc;
+ char hash[MD5_DIGEST_SIZE];
};
/*
@@ -741,51 +737,15 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"password tokens\n", __func__);
goto out_free_unlock;
}
- s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
- if (IS_ERR(s->hash_tfm)) {
- rc = PTR_ERR(s->hash_tfm);
- printk(KERN_ERR "%s: Error attempting to "
- "allocate hash crypto context; rc = [%d]\n",
- __func__, rc);
- goto out_free_unlock;
- }
-
- s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
- crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
- if (!s->hash_desc) {
- rc = -ENOMEM;
- goto out_release_free_unlock;
- }
- s->hash_desc->tfm = s->hash_tfm;
-
- rc = crypto_shash_digest(s->hash_desc,
- (u8 *)s->auth_tok->token.password.session_key_encryption_key,
- s->auth_tok->token.password.session_key_encryption_key_bytes,
- s->hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out_release_free_unlock;
- }
+ md5(s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes,
+ s->hash);
for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
s->block_aligned_filename[s->j] =
- s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
- if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
- == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
- rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE,
- s->tmp_hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error computing crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- memcpy(s->hash, s->tmp_hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
- }
+ s->hash[s->j % MD5_DIGEST_SIZE];
+ if ((s->j % MD5_DIGEST_SIZE) == (MD5_DIGEST_SIZE - 1))
+ md5(s->hash, MD5_DIGEST_SIZE, s->hash);
if (s->block_aligned_filename[s->j] == '\0')
s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
}
@@ -798,7 +758,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert filename memory to scatterlist; rc = [%d]. "
"block_aligned_filename_size = [%zd]\n", __func__, rc,
s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
s->dst_sg, 2);
@@ -807,7 +767,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"convert encrypted filename memory to scatterlist; "
"rc = [%d]. block_aligned_filename_size = [%zd]\n",
__func__, rc, s->block_aligned_filename_size);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
/* The characters in the first block effectively do the job
* of the IV here, so we just use 0's for the IV. Note the
@@ -825,7 +785,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
s->block_aligned_filename_size, s->iv);
@@ -833,13 +793,11 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt filename; "
"rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
+ goto out_free_unlock;
}
s->i += s->block_aligned_filename_size;
(*packet_size) = s->i;
(*remaining_bytes) -= (*packet_size);
-out_release_free_unlock:
- crypto_free_shash(s->hash_tfm);
out_free_unlock:
kfree_sensitive(s->block_aligned_filename);
out_unlock:
@@ -850,7 +808,6 @@ out:
key_put(auth_tok_key);
}
skcipher_request_free(s->skcipher_req);
- kfree_sensitive(s->hash_desc);
kfree(s);
return rc;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 16ea14dd2c62..c12dc680f8fe 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -12,6 +12,7 @@
#include <linux/dcache.h>
#include <linux/file.h>
+#include <linux/fips.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/skbuff.h>
@@ -454,6 +455,12 @@ static int ecryptfs_get_tree(struct fs_context *fc)
goto out;
}
+ if (fips_enabled) {
+ rc = -EINVAL;
+ err = "eCryptfs support is disabled due to FIPS";
+ goto out;
+ }
+
s = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(s)) {
rc = PTR_ERR(s);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e7b7f426fecf..3bc21d677564 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -41,10 +41,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
- if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
- kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
- goto out;
- }
+ ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
mutex_init(&inode_info->lower_file_mutex);
atomic_set(&inode_info->lower_file_count, 0);
inode_info->lower_file = NULL;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 1f4d8ce56667..6de97565d5f7 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -533,6 +533,7 @@ static struct file_system_type efivarfs_type = {
.init_fs_context = efivarfs_init_fs_context,
.kill_sb = efivarfs_kill_sb,
.parameters = efivarfs_parameters,
+ .fs_flags = FS_POWER_FREEZE,
};
static __init int efivarfs_init(void)
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 462619e59766..28407578f83a 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -62,7 +62,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
inode = iget_locked(super, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
in = INODE_INFO(inode);
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8ca29962a3dd..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
{
trace_erofs_read_folio(folio, true);
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ iomap_bio_read_folio(folio, &erofs_iomap_ops);
+ return 0;
}
static void erofs_readahead(struct readahead_control *rac)
@@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac)
trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
readahead_count(rac), true);
- return iomap_readahead(rac, &erofs_iomap_ops);
+ iomap_bio_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index b4bfe14229f9..e38d93bb2104 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -172,7 +172,6 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
dctx.bounce = strm->bounce;
do {
- dctx.avail_out = out_buf.size - out_buf.pos;
dctx.inbuf_sz = in_buf.size;
dctx.inbuf_pos = in_buf.pos;
err = z_erofs_stream_switch_bufs(&dctx, &out_buf.dst,
@@ -188,14 +187,18 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
in_buf.pos = dctx.inbuf_pos;
zerr = zstd_decompress_stream(stream, &out_buf, &in_buf);
- if (zstd_is_error(zerr) || (!zerr && rq->outputsize)) {
+ dctx.avail_out = out_buf.size - out_buf.pos;
+ if (zstd_is_error(zerr) ||
+ ((rq->outputsize + dctx.avail_out) && (!zerr || (zerr > 0 &&
+ !(rq->inputsize + in_buf.size - in_buf.pos))))) {
erofs_err(sb, "failed to decompress in[%u] out[%u]: %s",
rq->inputsize, rq->outputsize,
- zerr ? zstd_get_error_name(zerr) : "unexpected end of stream");
+ zstd_is_error(zerr) ? zstd_get_error_name(zerr) :
+ "unexpected end of stream");
err = -EFSCORRUPTED;
break;
}
- } while (rq->outputsize || out_buf.pos < out_buf.size);
+ } while (rq->outputsize + dctx.avail_out);
if (dctx.kout)
kunmap_local(dctx.kout);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index b7b3432a9882..d27938435b2f 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -47,7 +47,6 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
{
- const struct cred *old_cred;
struct iov_iter iter;
int ret;
@@ -61,9 +60,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
- old_cred = override_creds(rq->iocb.ki_filp->f_cred);
- ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
- revert_creds(old_cred);
+ scoped_with_creds(rq->iocb.ki_filp->f_cred)
+ ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index cb780c095d28..bce98c845a18 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -295,7 +295,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
int err = erofs_fill_inode(inode);
if (err) {
diff --git a/fs/eventfd.c b/fs/eventfd.c
index af42b2c7d235..3219e0d596fe 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -378,9 +378,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
static int do_eventfd(unsigned int count, int flags)
{
- struct eventfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct eventfd_ctx *ctx __free(kfree) = NULL;
/* Check the EFD_* constants for consistency. */
BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
@@ -398,26 +396,19 @@ static int do_eventfd(unsigned int count, int flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- goto err;
-
- file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
- ctx, flags, FMODE_NOWAIT);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err;
- }
- fd_install(fd, file);
- return fd;
-err:
- eventfd_free_ctx(ctx);
- return fd;
+
+ FD_PREPARE(fdf, flags,
+ anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, ctx,
+ flags, FMODE_NOWAIT));
+ if (fdf.err)
+ return fdf.err;
+
+ ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ee7c4b683ec3..6c36d9dc6926 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2165,9 +2165,8 @@ static void clear_tfile_check_list(void)
*/
static int do_epoll_create(int flags)
{
- int error, fd;
- struct eventpoll *ep = NULL;
- struct file *file;
+ int error;
+ struct eventpoll *ep;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -2184,26 +2183,15 @@ static int do_epoll_create(int flags)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
- if (fd < 0) {
- error = fd;
- goto out_free_ep;
- }
- file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
- O_RDWR | (flags & O_CLOEXEC));
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto out_free_fd;
+ FD_PREPARE(fdf, O_RDWR | (flags & O_CLOEXEC),
+ anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
+ O_RDWR | (flags & O_CLOEXEC)));
+ if (fdf.err) {
+ ep_clear_and_put(ep);
+ return fdf.err;
}
- ep->file = file;
- fd_install(fd, file);
- return fd;
-
-out_free_fd:
- put_unused_fd(fd);
-out_free_ep:
- ep_clear_and_put(ep);
- return error;
+ ep->file = fd_prepare_file(fdf);
+ return fd_publish(fdf);
}
SYSCALL_DEFINE1(epoll_create1, int, flags)
diff --git a/fs/exec.c b/fs/exec.c
index 4298e7e08d5d..7cb001a222d1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1280,10 +1280,9 @@ int begin_new_exec(struct linux_binprm * bprm)
/* Pass the opened binary to the interpreter. */
if (bprm->have_execfd) {
- retval = get_unused_fd_flags(0);
+ retval = FD_ADD(0, bprm->executable);
if (retval < 0)
goto out_unlock;
- fd_install(retval, bprm->executable);
bprm->executable = NULL;
bprm->execfd = retval;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 7f9592856bf7..74d451f732c7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -433,7 +433,10 @@ static int exfat_read_boot_sector(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
/* set block size to read super block */
- sb_min_blocksize(sb, 512);
+ if (!sb_min_blocksize(sb, 512)) {
+ exfat_err(sb, "unable to set blocksize");
+ return -EINVAL;
+ }
/* read boot sector */
sbi->boot_bh = sb_bread(sb, 0);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e10c376843d7..dbfe9098a124 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1398,7 +1398,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ei = EXT2_I(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e99306a8f47c..78ea864fa8cd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -202,8 +202,7 @@ void ext4_evict_inode(struct inode *inode)
* the inode. Flush worker is ignoring it because of I_FREEING flag but
* we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list))
- inode_io_list_del(inode);
+ inode_io_list_del(inode);
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -425,7 +424,7 @@ void ext4_check_map_extents_env(struct inode *inode)
if (!S_ISREG(inode->i_mode) ||
IS_NOQUOTA(inode) || IS_VERITY(inode) ||
is_special_ino(inode->i_sb, inode->i_ino) ||
- (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ (inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
ext4_verity_in_progress(inode))
return;
@@ -1319,8 +1318,8 @@ retry_grab:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
from = offset_in_folio(folio, pos);
to = from + len;
@@ -2619,10 +2618,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_folio(mpd->inode);
- if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(mpd->wbc);
mpd->map.m_len = 0;
mpd->next_pos = mpd->start_pos;
@@ -2704,7 +2700,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->start_pos = folio_pos(folio);
- mpd->next_pos = folio_pos(folio) + folio_size(folio);
+ mpd->next_pos = folio_next_pos(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -3146,8 +3142,8 @@ retry:
if (IS_ERR(folio))
return PTR_ERR(folio);
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ if (len > folio_next_pos(folio) - pos)
+ len = folio_next_pos(folio) - pos;
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
@@ -3473,7 +3469,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->i_private_list))
return true;
- return inode->i_state & I_DIRTY_DATASYNC;
+ return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
@@ -4552,7 +4548,7 @@ int ext4_truncate(struct inode *inode)
* or it's a completely new inode. In those cases we might not
* have i_rwsem locked because it's not necessary.
*/
- if (!(inode->i_state & (I_NEW|I_FREEING)))
+ if (!(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
@@ -5210,7 +5206,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
ret = check_igot_inode(inode, flags, function, line);
if (ret) {
iput(inode);
@@ -5549,7 +5545,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
if (inode_is_dirtytime_only(inode)) {
struct ext4_inode_info *ei = EXT4_I(inode);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index ab1ff51302fb..6f57c181ff77 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -57,16 +57,12 @@ static int write_mmp_block_thawed(struct super_block *sb,
static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
{
- int err;
-
/*
* We protect against freezing so that we don't create dirty buffers
* on frozen filesystem.
*/
- sb_start_write(sb);
- err = write_mmp_block_thawed(sb, bh);
- sb_end_write(sb);
- return err;
+ scoped_guard(super_write, sb)
+ return write_mmp_block_thawed(sb, bh);
}
/*
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 82d5e7501455..5fd54adf0c88 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -107,7 +107,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
if (!sbi->s_journal || is_bad_inode(inode))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_inode_orphan_tracked(inode))
return 0;
@@ -232,7 +232,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & (I_NEW | I_FREEING)) &&
!inode_is_locked(inode));
if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE))
return ext4_orphan_file_del(handle, inode);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index d4d7f329d23f..fa8d81a30fb9 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -9,6 +9,7 @@
*
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
+#include <linux/fs_struct.h>
#include <linux/f2fs_fs.h>
#include "f2fs.h"
#include "xattr.h"
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 6ad8d3bc6df7..be53e06caf3d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1329,7 +1329,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
}
folio = page_folio(cc->rpages[last_index]);
- psize = folio_pos(folio) + folio_size(folio);
+ psize = folio_next_pos(folio);
err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
if (err)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 775aa4f63aa3..8bf4feda42b0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2986,10 +2986,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
retry = 0;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -4222,7 +4219,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (map.m_flags & F2FS_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
- if ((inode->i_state & I_DIRTY_DATASYNC) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_DATASYNC) ||
offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 8c4eafe9ffac..f1cda1900658 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -569,7 +569,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (is_meta_ino(sbi, ino)) {
f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino);
set_sbi_flag(sbi, SBI_NEED_FSCK);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b882771e4699..af40282a6948 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -844,7 +844,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
f2fs_i_links_write(inode, false);
spin_lock(&inode->i_lock);
- inode->i_state |= I_LINKABLE;
+ inode_state_set(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
} else {
if (file)
@@ -1057,7 +1057,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto put_out_dir;
spin_lock(&whiteout->i_lock);
- whiteout->i_state &= ~I_LINKABLE;
+ inode_state_clear(whiteout, I_LINKABLE);
spin_unlock(&whiteout->i_lock);
iput(whiteout);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index db7afb806411..47489d48f2b9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1798,7 +1798,7 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) {
+ if ((!inode_unhashed(inode) && inode_state_read(inode) & I_SYNC)) {
if (!inode->i_nlink && !is_bad_inode(inode)) {
/* to avoid evict_inode call simultaneously */
__iget(inode);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 9648ed097816..0b6009cd1844 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -22,6 +22,7 @@
#include <linux/unaligned.h>
#include <linux/random.h>
#include <linux/iversion.h>
+#include <linux/fs_struct.h>
#include "fat.h"
#ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -1595,8 +1596,12 @@ int fat_fill_super(struct super_block *sb, struct fs_context *fc,
setup(sb); /* flavour-specific stuff that needs options */
+ error = -EINVAL;
+ if (!sb_min_blocksize(sb, 512)) {
+ fat_msg(sb, KERN_ERR, "unable to set blocksize");
+ goto out_fail;
+ }
error = -EIO;
- sb_min_blocksize(sb, 512);
bh = sb_bread(sb, 0);
if (bh == NULL) {
fat_msg(sb, KERN_ERR, "unable to read boot sector");
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 72f8433d9109..f93dbca08435 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -445,6 +445,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
void __user *argp = (void __user *)arg;
+ struct delegation deleg;
int argi = (int)arg;
struct flock flock;
long err = -EINVAL;
@@ -550,6 +551,18 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SET_RW_HINT:
err = fcntl_set_rw_hint(filp, arg);
break;
+ case F_GETDELEG:
+ if (copy_from_user(&deleg, argp, sizeof(deleg)))
+ return -EFAULT;
+ err = fcntl_getdeleg(filp, &deleg);
+ if (!err && copy_to_user(argp, &deleg, sizeof(deleg)))
+ return -EFAULT;
+ break;
+ case F_SETDELEG:
+ if (copy_from_user(&deleg, argp, sizeof(deleg)))
+ return -EFAULT;
+ err = fcntl_setdeleg(fd, filp, &deleg);
+ break;
default:
break;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 052f9c9368fb..3de1547ec9d4 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -404,32 +404,28 @@ out_path:
return retval;
}
+static struct file *file_open_handle(struct path *path, int open_flag)
+{
+ const struct export_operations *eops;
+
+ eops = path->mnt->mnt_sb->s_export_op;
+ if (eops->open)
+ return eops->open(path, open_flag);
+
+ return file_open_root(path, "", open_flag, 0);
+}
+
static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
int open_flag)
{
- long retval = 0;
+ long retval;
struct path path __free(path_put) = {};
- struct file *file;
- const struct export_operations *eops;
retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval)
return retval;
- CLASS(get_unused_fd, fd)(open_flag);
- if (fd < 0)
- return fd;
-
- eops = path.mnt->mnt_sb->s_export_op;
- if (eops->open)
- file = eops->open(&path, open_flag);
- else
- file = file_open_root(&path, "", open_flag, 0);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- fd_install(fd, file);
- return take_fd(fd);
+ return FD_ADD(open_flag, file_open_handle(&path, open_flag));
}
/**
diff --git a/fs/file.c b/fs/file.c
index 28743b742e3c..0a4f3bdb2dec 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -641,6 +641,34 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
+/*
+ * Install a file pointer in the fd array while it is being resized.
+ *
+ * We need to make sure our update to the array does not get lost as the resizing
+ * thread can be copying the content as we modify it.
+ *
+ * We have two ways to do it:
+ * - go off CPU waiting for resize_in_progress to clear
+ * - take the spin lock
+ *
+ * The latter is trivial to implement and saves us from having to might_sleep()
+ * for debugging purposes.
+ *
+ * This is moved out of line from fd_install() to convince gcc to optimize that
+ * routine better.
+ */
+static void noinline fd_install_slowpath(unsigned int fd, struct file *file)
+{
+ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
+ rcu_assign_pointer(fdt->fd[fd], file);
+ spin_unlock(&files->file_lock);
+}
+
/**
* fd_install - install a file pointer in the fd array
* @fd: file descriptor to install the file in
@@ -658,14 +686,9 @@ void fd_install(unsigned int fd, struct file *file)
return;
rcu_read_lock_sched();
-
if (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
- rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ fd_install_slowpath(fd, file);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
@@ -1357,28 +1380,25 @@ out_unlock:
*/
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
- int new_fd;
int error;
error = security_file_receive(file);
if (error)
return error;
- new_fd = get_unused_fd_flags(o_flags);
- if (new_fd < 0)
- return new_fd;
+ FD_PREPARE(fdf, o_flags, file);
+ if (fdf.err)
+ return fdf.err;
+ get_file(file);
if (ufd) {
- error = put_user(new_fd, ufd);
- if (error) {
- put_unused_fd(new_fd);
+ error = put_user(fd_prepare_fd(fdf), ufd);
+ if (error)
return error;
- }
}
- fd_install(new_fd, get_file(file));
- __receive_sock(file);
- return new_fd;
+ __receive_sock(fd_prepare_file(fdf));
+ return fd_publish(fdf);
}
EXPORT_SYMBOL_GPL(receive_fd);
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 1dcec88c0680..4c4916632f11 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -316,7 +316,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp)
err = put_user(fa.flags, argp);
return err;
}
-EXPORT_SYMBOL(ioctl_getflags);
int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
@@ -337,7 +336,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
}
return err;
}
-EXPORT_SYMBOL(ioctl_setflags);
int ioctl_fsgetxattr(struct file *file, void __user *argp)
{
@@ -350,7 +348,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp)
return err;
}
-EXPORT_SYMBOL(ioctl_fsgetxattr);
int ioctl_fssetxattr(struct file *file, void __user *argp)
{
@@ -369,7 +366,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
}
return err;
}
-EXPORT_SYMBOL(ioctl_fssetxattr);
SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
struct file_attr __user *, ufattr, size_t, usize,
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 20600e9ea202..21fc94b98209 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -258,7 +258,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
ip = iget_locked(sbp, ino);
if (!ip)
return ERR_PTR(-ENOMEM);
- if (!(ip->i_state & I_NEW))
+ if (!(inode_state_read_once(ip) & I_NEW))
return ip;
vip = VXFS_INO(ip);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2b35e80037fe..6800886c4d10 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,6 +14,7 @@
* Additions for address_space-based writeback
*/
+#include <linux/sched/sysctl.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -32,11 +33,6 @@
#include "internal.h"
/*
- * 4MB minimal write chunk size
- */
-#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
-
-/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
@@ -121,7 +117,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
list_move(&inode->i_io_list, head);
@@ -200,6 +196,19 @@ static void wb_queue_work(struct bdi_writeback *wb,
spin_unlock_irq(&wb->work_lock);
}
+static bool wb_wait_for_completion_cb(struct wb_completion *done)
+{
+ unsigned long waited_secs = (jiffies - done->wait_start) / HZ;
+
+ done->progress_stamp = jiffies;
+ if (waited_secs > sysctl_hung_task_timeout_secs)
+ pr_info("INFO: The task %s:%d has been waiting for writeback "
+ "completion for more than %lu seconds.",
+ current->comm, current->pid, waited_secs);
+
+ return !atomic_read(&done->cnt);
+}
+
/**
* wb_wait_for_completion - wait for completion of bdi_writeback_works
* @done: target wb_completion
@@ -212,8 +221,9 @@ static void wb_queue_work(struct bdi_writeback *wb,
*/
void wb_wait_for_completion(struct wb_completion *done)
{
+ done->wait_start = jiffies;
atomic_dec(&done->cnt); /* put down the initial count */
- wait_event(*done->waitq, !atomic_read(&done->cnt));
+ wait_event(*done->waitq, wb_wait_for_completion_cb(done));
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -304,9 +314,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
if (wb != &wb->bdi->wb)
list_move(&inode->i_io_list, &wb->b_attached);
else
@@ -408,7 +418,7 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
* path owns the inode and we shouldn't modify ->i_io_list.
*/
- if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
+ if (unlikely(inode_state_read(inode) & (I_FREEING | I_WILL_FREE)))
goto skip_switch;
trace_inode_switch_wbs(inode, old_wb, new_wb);
@@ -451,7 +461,7 @@ static bool inode_do_switch_wbs(struct inode *inode,
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;
- if (inode->i_state & I_DIRTY_ALL) {
+ if (inode_state_read(inode) & I_DIRTY_ALL) {
/*
* We need to keep b_dirty list sorted by
* dirtied_time_when. However properly sorting the
@@ -476,10 +486,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
switched = true;
skip_switch:
/*
- * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+ * Paired with an acquire fence in unlocked_inode_to_wb_begin() and
* ensures that the new wb is visible if they see !I_WB_SWITCH.
*/
- smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+ smp_wmb();
+ inode_state_clear(inode, I_WB_SWITCH);
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&inode->i_lock);
@@ -600,12 +611,12 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
if (!(inode->i_sb->s_flags & SB_ACTIVE) ||
- inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
+ inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
inode_to_wb(inode) == new_wb) {
spin_unlock(&inode->i_lock);
return false;
}
- inode->i_state |= I_WB_SWITCH;
+ inode_state_set(inode, I_WB_SWITCH);
__iget(inode);
spin_unlock(&inode->i_lock);
@@ -635,7 +646,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct bdi_writeback *new_wb = NULL;
/* noop if seems to be already in progress */
- if (inode->i_state & I_WB_SWITCH)
+ if (inode_state_read_once(inode) & I_WB_SWITCH)
return;
/* avoid queueing a new switch if too many are already in flight */
@@ -807,9 +818,9 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
* @wbc: writeback_control of interest
* @inode: target inode
*
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
+ * This function is to be used by filemap_writeback(), which is an alternative
+ * entry point into writeback code, and first ensures @inode is associated with
+ * a bdi_writeback and attaches it to @wbc.
*/
void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
@@ -1236,9 +1247,9 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
- WARN_ON_ONCE(inode->i_state & I_FREEING);
+ WARN_ON_ONCE(inode_state_read(inode) & I_FREEING);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -1348,10 +1359,17 @@ void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
+ /*
+ * FIXME: ext4 can call here from ext4_evict_inode() after evict() already
+ * unlinked the inode.
+ */
+ if (list_empty_careful(&inode->i_io_list))
+ return;
+
wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
@@ -1409,13 +1427,13 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
/*
* When the inode is being freed just don't bother with dirty list
* tracking. Flush worker will ignore this inode anyway and it will
* trigger assertions in inode_io_list_move_locked().
*/
- if (inode->i_state & I_FREEING) {
+ if (inode_state_read(inode) & I_FREEING) {
list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
return;
@@ -1449,9 +1467,9 @@ static void inode_sync_complete(struct inode *inode)
{
assert_spin_locked(&inode->i_lock);
- inode->i_state &= ~I_SYNC;
+ inode_state_clear(inode, I_SYNC);
/* If inode is clean an unused, put it into LRU now... */
- inode_add_lru(inode);
+ inode_lru_list_add(inode);
/* Called with inode->i_lock which ensures memory ordering. */
inode_wake_up_bit(inode, __I_SYNC);
}
@@ -1493,7 +1511,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
spin_lock(&inode->i_lock);
list_move(&inode->i_io_list, &tmp);
moved++;
- inode->i_state |= I_SYNC_QUEUED;
+ inode_state_set(inode, I_SYNC_QUEUED);
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
@@ -1579,14 +1597,14 @@ void inode_wait_for_writeback(struct inode *inode)
assert_spin_locked(&inode->i_lock);
- if (!(inode->i_state & I_SYNC))
+ if (!(inode_state_read(inode) & I_SYNC))
return;
wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
for (;;) {
prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
- if (!(inode->i_state & I_SYNC))
+ if (!(inode_state_read(inode) & I_SYNC))
break;
spin_unlock(&inode->i_lock);
schedule();
@@ -1612,7 +1630,7 @@ static void inode_sleep_on_writeback(struct inode *inode)
wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
- sleep = !!(inode->i_state & I_SYNC);
+ sleep = !!(inode_state_read(inode) & I_SYNC);
spin_unlock(&inode->i_lock);
if (sleep)
schedule();
@@ -1631,7 +1649,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
struct writeback_control *wbc,
unsigned long dirtied_before)
{
- if (inode->i_state & I_FREEING)
+ if (inode_state_read(inode) & I_FREEING)
return;
/*
@@ -1639,7 +1657,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* shot. If still dirty, it will be redirty_tail()'ed below. Update
* the dirty time to prevent enqueue and sync it again.
*/
- if ((inode->i_state & I_DIRTY) &&
+ if ((inode_state_read(inode) & I_DIRTY) &&
(wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
inode->dirtied_when = jiffies;
@@ -1650,7 +1668,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* is odd for clean inodes, it can happen for some
* filesystems so handle that gracefully.
*/
- if (inode->i_state & I_DIRTY_ALL)
+ if (inode_state_read(inode) & I_DIRTY_ALL)
redirty_tail_locked(inode, wb);
else
inode_cgwb_move_to_attached(inode, wb);
@@ -1676,17 +1694,17 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
*/
redirty_tail_locked(inode, wb);
}
- } else if (inode->i_state & I_DIRTY) {
+ } else if (inode_state_read(inode) & I_DIRTY) {
/*
* Filesystems can dirty the inode during writeback operations,
* such as delayed allocation during submission or metadata
* updates after data IO completion.
*/
redirty_tail_locked(inode, wb);
- } else if (inode->i_state & I_DIRTY_TIME) {
+ } else if (inode_state_read(inode) & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
- inode->i_state &= ~I_SYNC_QUEUED;
+ inode_state_clear(inode, I_SYNC_QUEUED);
} else {
/* The inode is clean. Remove from writeback lists. */
inode_cgwb_move_to_attached(inode, wb);
@@ -1712,7 +1730,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
unsigned dirty;
int ret;
- WARN_ON(!(inode->i_state & I_SYNC));
+ WARN_ON(!(inode_state_read_once(inode) & I_SYNC));
trace_writeback_single_inode_start(inode, wbc, nr_to_write);
@@ -1736,7 +1754,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* mark_inode_dirty_sync() to notify the filesystem about it and to
* change I_DIRTY_TIME into I_DIRTY_SYNC.
*/
- if ((inode->i_state & I_DIRTY_TIME) &&
+ if ((inode_state_read_once(inode) & I_DIRTY_TIME) &&
(wbc->sync_mode == WB_SYNC_ALL ||
time_after(jiffies, inode->dirtied_time_when +
dirtytime_expire_interval * HZ))) {
@@ -1751,8 +1769,8 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* after handling timestamp expiration, as that may dirty the inode too.
*/
spin_lock(&inode->i_lock);
- dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~dirty;
+ dirty = inode_state_read(inode) & I_DIRTY;
+ inode_state_clear(inode, dirty);
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -1768,10 +1786,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
- if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_NETFS_WB;
+ inode_state_set(inode, I_DIRTY_PAGES);
+ else if (unlikely(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ if (!(inode_state_read(inode) & I_DIRTY_PAGES)) {
+ inode_state_clear(inode, I_PINNING_NETFS_WB);
wbc->unpinned_netfs_wb = true;
dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
@@ -1807,11 +1825,11 @@ static int writeback_single_inode(struct inode *inode,
spin_lock(&inode->i_lock);
if (!icount_read(inode))
- WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+ WARN_ON(!(inode_state_read(inode) & (I_WILL_FREE | I_FREEING)));
else
- WARN_ON(inode->i_state & I_WILL_FREE);
+ WARN_ON(inode_state_read(inode) & I_WILL_FREE);
- if (inode->i_state & I_SYNC) {
+ if (inode_state_read(inode) & I_SYNC) {
/*
* Writeback is already running on the inode. For WB_SYNC_NONE,
* that's enough and we can just return. For WB_SYNC_ALL, we
@@ -1822,7 +1840,7 @@ static int writeback_single_inode(struct inode *inode,
goto out;
inode_wait_for_writeback(inode);
}
- WARN_ON(inode->i_state & I_SYNC);
+ WARN_ON(inode_state_read(inode) & I_SYNC);
/*
* If the inode is already fully clean, then there's nothing to do.
*
@@ -1830,11 +1848,11 @@ static int writeback_single_inode(struct inode *inode,
* still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If
* there are any such pages, we'll need to wait for them.
*/
- if (!(inode->i_state & I_DIRTY_ALL) &&
+ if (!(inode_state_read(inode) & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
- inode->i_state |= I_SYNC;
+ inode_state_set(inode, I_SYNC);
wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
@@ -1847,18 +1865,18 @@ static int writeback_single_inode(struct inode *inode,
* If the inode is freeing, its i_io_list shoudn't be updated
* as it can be finally deleted at this moment.
*/
- if (!(inode->i_state & I_FREEING)) {
+ if (!(inode_state_read(inode) & I_FREEING)) {
/*
* If the inode is now fully clean, then it can be safely
* removed from its writeback list (if any). Otherwise the
* flusher threads are responsible for the writeback lists.
*/
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read(inode) & I_DIRTY_ALL))
inode_cgwb_move_to_attached(inode, wb);
- else if (!(inode->i_state & I_SYNC_QUEUED)) {
- if ((inode->i_state & I_DIRTY))
+ else if (!(inode_state_read(inode) & I_SYNC_QUEUED)) {
+ if ((inode_state_read(inode) & I_DIRTY))
redirty_tail_locked(inode, wb);
- else if (inode->i_state & I_DIRTY_TIME) {
+ else if (inode_state_read(inode) & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
inode_io_list_move_locked(inode,
wb,
@@ -1874,8 +1892,8 @@ out:
return ret;
}
-static long writeback_chunk_size(struct bdi_writeback *wb,
- struct wb_writeback_work *work)
+static long writeback_chunk_size(struct super_block *sb,
+ struct bdi_writeback *wb, struct wb_writeback_work *work)
{
long pages;
@@ -1893,16 +1911,13 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* (maybe slowly) sync all tagged pages
*/
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
- pages = LONG_MAX;
- else {
- pages = min(wb->avg_write_bandwidth / 2,
- global_wb_domain.dirty_limit / DIRTY_SCOPE);
- pages = min(pages, work->nr_pages);
- pages = round_down(pages + MIN_WRITEBACK_PAGES,
- MIN_WRITEBACK_PAGES);
- }
+ return LONG_MAX;
- return pages;
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
+ pages = min(pages, work->nr_pages);
+ return round_down(pages + sb->s_min_writeback_pages,
+ sb->s_min_writeback_pages);
}
/*
@@ -1967,12 +1982,12 @@ static long writeback_sb_inodes(struct super_block *sb,
* kind writeout is handled by the freer.
*/
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) {
redirty_tail_locked(inode, wb);
spin_unlock(&inode->i_lock);
continue;
}
- if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+ if ((inode_state_read(inode) & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
/*
* If this inode is locked for writeback and we are not
* doing writeback-for-data-integrity, move it to
@@ -1994,17 +2009,17 @@ static long writeback_sb_inodes(struct super_block *sb,
* are doing WB_SYNC_NONE writeback. So this catches only the
* WB_SYNC_ALL case.
*/
- if (inode->i_state & I_SYNC) {
+ if (inode_state_read(inode) & I_SYNC) {
/* Wait for I_SYNC. This function drops i_lock... */
inode_sleep_on_writeback(inode);
/* Inode may be gone, start again */
spin_lock(&wb->list_lock);
continue;
}
- inode->i_state |= I_SYNC;
+ inode_state_set(inode, I_SYNC);
wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb, work);
+ write_chunk = writeback_chunk_size(inode->i_sb, wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -2014,6 +2029,12 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
__writeback_single_inode(inode, &wbc);
+ /* Report progress to inform the hung task detector of the progress. */
+ if (work->done && work->done->progress_stamp &&
+ (jiffies - work->done->progress_stamp) > HZ *
+ sysctl_hung_task_timeout_secs / 2)
+ wake_up_all(work->done->waitq);
+
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
@@ -2039,7 +2060,7 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read(inode) & I_DIRTY_ALL))
total_wrote++;
requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
inode_sync_complete(inode);
@@ -2545,10 +2566,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* We tell ->dirty_inode callback that timestamps need to
* be updated by setting I_DIRTY_TIME in flags.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- if (inode->i_state & I_DIRTY_TIME) {
- inode->i_state &= ~I_DIRTY_TIME;
+ if (inode_state_read(inode) & I_DIRTY_TIME) {
+ inode_state_clear(inode, I_DIRTY_TIME);
flags |= I_DIRTY_TIME;
}
spin_unlock(&inode->i_lock);
@@ -2585,16 +2606,16 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if ((inode->i_state & flags) == flags)
+ if ((inode_state_read_once(inode) & flags) == flags)
return;
spin_lock(&inode->i_lock);
- if ((inode->i_state & flags) != flags) {
- const int was_dirty = inode->i_state & I_DIRTY;
+ if ((inode_state_read(inode) & flags) != flags) {
+ const int was_dirty = inode_state_read(inode) & I_DIRTY;
inode_attach_wb(inode, NULL);
- inode->i_state |= flags;
+ inode_state_set(inode, flags);
/*
* Grab inode's wb early because it requires dropping i_lock and we
@@ -2613,7 +2634,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
- if (inode->i_state & I_SYNC_QUEUED)
+ if (inode_state_read(inode) & I_SYNC_QUEUED)
goto out_unlock;
/*
@@ -2624,7 +2645,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (inode_unhashed(inode))
goto out_unlock;
}
- if (inode->i_state & I_FREEING)
+ if (inode_state_read(inode) & I_FREEING)
goto out_unlock;
/*
@@ -2639,7 +2660,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (dirtytime)
inode->dirtied_time_when = jiffies;
- if (inode->i_state & I_DIRTY)
+ if (inode_state_read(inode) & I_DIRTY)
dirty_list = &wb->b_dirty;
else
dirty_list = &wb->b_dirty_time;
@@ -2736,7 +2757,7 @@ static void wait_sb_inodes(struct super_block *sb)
spin_unlock_irq(&sb->s_inode_wblist_lock);
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
spin_lock_irq(&sb->s_inode_wblist_lock);
diff --git a/fs/fs_types.c b/fs/fs_dirent.c
index 78365e5dc08c..e5e08f213816 100644
--- a/fs/fs_types.c
+++ b/fs/fs_dirent.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/fs.h>
+#include <linux/fs_dirent.h>
#include <linux/export.h>
/*
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 28be762ac1c6..b8c46c5a38a0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -146,12 +146,6 @@ int unshare_fs_struct(void)
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);
-int current_umask(void)
-{
- return current->fs->umask;
-}
-EXPORT_SYMBOL(current_umask);
-
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ecaec0fea3a1..87a63ae93a45 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = fc->blkbits;
+ blkbits = inode->i_sb->s_blocksize_bits;
stat->blksize = 1 << blkbits;
}
@@ -1397,27 +1397,25 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- inode_lock_nested(parent, I_MUTEX_PARENT);
if (!S_ISDIR(parent->i_mode))
- goto unlock;
+ goto put_parent;
err = -ENOENT;
dir = d_find_alias(parent);
if (!dir)
- goto unlock;
+ goto put_parent;
- name->hash = full_name_hash(dir, name->name, name->len);
- entry = d_lookup(dir, name);
+ entry = start_removing_noperm(dir, name);
dput(dir);
- if (!entry)
- goto unlock;
+ if (IS_ERR(entry))
+ goto put_parent;
fuse_dir_changed(parent);
if (!(flags & FUSE_EXPIRE_ONLY))
d_invalidate(entry);
fuse_invalidate_entry_cache(entry);
- if (child_nodeid != 0 && d_really_is_positive(entry)) {
+ if (child_nodeid != 0) {
inode_lock(d_inode(entry));
if (get_node_id(d_inode(entry)) != child_nodeid) {
err = -ENOENT;
@@ -1445,10 +1443,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
} else {
err = 0;
}
- dput(entry);
- unlock:
- inode_unlock(parent);
+ end_removing(entry);
+ put_parent:
iput(parent);
return err;
}
@@ -2230,6 +2227,7 @@ static const struct file_operations fuse_dir_operations = {
.fsync = fuse_dir_fsync,
.unlocked_ioctl = fuse_dir_ioctl,
.compat_ioctl = fuse_dir_compat_ioctl,
+ .setlease = simple_nosetlease,
};
static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f1ef77a0be05..7bcb650a9f26 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio,
return 0;
}
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
+struct fuse_fill_read_data {
+ struct file *file;
+
+ /* Fields below are used if sending the read request asynchronously */
+ struct fuse_conn *fc;
+ struct fuse_io_args *ia;
+ unsigned int nr_bytes;
+};
+
+/* forward declarations */
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write);
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count, bool async);
+
+static int fuse_handle_readahead(struct folio *folio,
+ struct readahead_control *rac,
+ struct fuse_fill_read_data *data, loff_t pos,
+ size_t len)
+{
+ struct fuse_io_args *ia = data->ia;
+ size_t off = offset_in_folio(folio, pos);
+ struct fuse_conn *fc = data->fc;
+ struct fuse_args_pages *ap;
+ unsigned int nr_pages;
+
+ if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
+ false)) {
+ fuse_send_readpages(ia, data->file, data->nr_bytes,
+ fc->async_read);
+ data->nr_bytes = 0;
+ data->ia = NULL;
+ ia = NULL;
+ }
+ if (!ia) {
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ return -EAGAIN;
+
+ nr_pages = min(fc->max_pages, readahead_count(rac));
+ data->ia = fuse_io_alloc(NULL, nr_pages);
+ if (!data->ia)
+ return -ENOMEM;
+ ia = data->ia;
+ }
+ folio_get(folio);
+ ap = &ia->ap;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = off;
+ ap->descs[ap->num_folios].length = len;
+ data->nr_bytes += len;
+ ap->num_folios++;
+
+ return 0;
+}
+
+static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx,
+ size_t len)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+ struct folio *folio = ctx->cur_folio;
+ loff_t pos = iter->pos;
+ size_t off = offset_in_folio(folio, pos);
+ struct file *file = data->file;
+ int ret;
+
+ if (ctx->rac) {
+ ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
+ } else {
+ /*
+ * for non-readahead read requests, do reads synchronously
+ * since it's not guaranteed that the server can handle
+ * out-of-order reads
+ */
+ ret = fuse_do_readfolio(file, folio, off, len);
+ if (!ret)
+ iomap_finish_folio_read(folio, off, len, ret);
+ }
+ return ret;
+}
+
+static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+
+ if (data->ia)
+ fuse_send_readpages(data->ia, data->file, data->nr_bytes,
+ data->fc->async_read);
+}
+
+static const struct iomap_read_ops fuse_iomap_read_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range_async,
+ .submit_read = fuse_iomap_read_submit,
+};
+
static int fuse_read_folio(struct file *file, struct folio *folio)
{
struct inode *inode = folio->mapping->host;
- int err;
+ struct fuse_fill_read_data data = {
+ .file = file,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .cur_folio = folio,
+ .ops = &fuse_iomap_read_ops,
+ .read_ctx = &data,
- err = -EIO;
- if (fuse_is_bad(inode))
- goto out;
+ };
- err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
- if (!err)
- folio_mark_uptodate(folio);
+ if (fuse_is_bad(inode)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+ iomap_read_folio(&fuse_iomap_ops, &ctx);
fuse_invalidate_atime(inode);
- out:
- folio_unlock(folio);
- return err;
+ return 0;
}
static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
@@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
- folio_end_read(ap->folios[i], !err);
+ iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
+ ap->descs[i].length, err);
folio_put(ap->folios[i]);
}
if (ia->ff)
@@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
- unsigned int count)
+ unsigned int count, bool async)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
@@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
ia->read.attr_ver = fuse_get_attr_version(fm->fc);
- if (fm->fc->async_read) {
+ if (async) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- unsigned int max_pages, nr_pages;
- struct folio *folio = NULL;
+ struct fuse_fill_read_data data = {
+ .file = rac->file,
+ .fc = fc,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &fuse_iomap_read_ops,
+ .rac = rac,
+ .read_ctx = &data
+ };
if (fuse_is_bad(inode))
return;
- max_pages = min_t(unsigned int, fc->max_pages,
- fc->max_read / PAGE_SIZE);
-
- /*
- * This is only accurate the first time through, since readahead_folio()
- * doesn't update readahead_count() from the previous folio until the
- * next call. Grab nr_pages here so we know how many pages we're going
- * to have to process. This means that we will exit here with
- * readahead_count() == folio_nr_pages(last_folio), but we will have
- * consumed all of the folios, and read_pages() will call
- * readahead_folio() again which will clean up the rac.
- */
- nr_pages = readahead_count(rac);
-
- while (nr_pages) {
- struct fuse_io_args *ia;
- struct fuse_args_pages *ap;
- unsigned cur_pages = min(max_pages, nr_pages);
- unsigned int pages = 0;
-
- if (fc->num_background >= fc->congestion_threshold &&
- rac->ra->async_size >= readahead_count(rac))
- /*
- * Congested and only async pages left, so skip the
- * rest.
- */
- break;
-
- ia = fuse_io_alloc(NULL, cur_pages);
- if (!ia)
- break;
- ap = &ia->ap;
-
- while (pages < cur_pages) {
- unsigned int folio_pages;
-
- /*
- * This returns a folio with a ref held on it.
- * The ref needs to be held until the request is
- * completed, since the splice case (see
- * fuse_try_move_page()) drops the ref after it's
- * replaced in the page cache.
- */
- if (!folio)
- folio = __readahead_folio(rac);
-
- folio_pages = folio_nr_pages(folio);
- if (folio_pages > cur_pages - pages) {
- /*
- * Large folios belonging to fuse will never
- * have more pages than max_pages.
- */
- WARN_ON(!pages);
- break;
- }
-
- ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].length = folio_size(folio);
- ap->num_folios++;
- pages += folio_pages;
- folio = NULL;
- }
- fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
- nr_pages -= pages;
- }
- if (folio) {
- folio_end_read(folio, false);
- folio_put(folio);
- }
+ iomap_readahead(&fuse_iomap_ops, &ctx);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = {
.read_folio_range = fuse_iomap_read_folio_range,
};
-static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- iomap->type = IOMAP_MAPPED;
- iomap->length = length;
- iomap->offset = offset;
- return 0;
-}
-
-static const struct iomap_ops fuse_iomap_ops = {
- .iomap_begin = fuse_iomap_begin,
-};
-
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- iomap_finish_folio_write(inode, ap->folios[i], 1);
+ iomap_finish_folio_write(inode, ap->folios[i],
+ ap->descs[i].length);
wake_up(&fi->page_waitq);
}
@@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data {
struct fuse_file *ff;
unsigned int max_folios;
/*
- * nr_bytes won't overflow since fuse_writepage_need_send() caps
+ * nr_bytes won't overflow since fuse_folios_need_send() caps
* wb requests to never exceed fc->max_pages (which has an upper bound
* of U16_MAX).
*/
@@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode,
spin_unlock(&fi->lock);
}
-static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
- unsigned len, struct fuse_args_pages *ap,
- struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write)
{
struct folio *prev_folio;
struct fuse_folio_desc prev_desc;
- unsigned bytes = data->nr_bytes + len;
+ unsigned bytes = cur_bytes + len;
loff_t prev_pos;
+ size_t max_bytes = write ? fc->max_write : fc->max_read;
WARN_ON(!ap->num_folios);
@@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
- /* Reached max write bytes */
- if (bytes > fc->max_write)
+ if (bytes > max_bytes)
return true;
/* Discontinuity */
@@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if (prev_pos != pos)
return true;
- /* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios &&
- !fuse_pages_realloc(data, fc->max_pages))
- return true;
-
return false;
}
@@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
- fuse_writepages_send(inode, data);
- data->wpa = NULL;
- data->nr_bytes = 0;
+ if (wpa) {
+ bool send = fuse_folios_need_send(fc, pos, len, ap,
+ data->nr_bytes, true);
+
+ if (!send) {
+ /*
+ * Need to grow the pages array? If so, did the
+ * expansion fail?
+ */
+ send = (ap->num_folios == data->max_folios) &&
+ !fuse_pages_realloc(data, fc->max_pages);
+ }
+
+ if (send) {
+ fuse_writepages_send(inode, data);
+ data->wpa = NULL;
+ data->nr_bytes = 0;
+ }
}
if (data->wpa == NULL) {
@@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
ap = &wpa->ia.ap;
}
- iomap_start_folio_write(inode, folio, 1);
fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
offset, len);
data->nr_bytes += len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c2f2a48156d6..f616c1991fed 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -981,14 +981,6 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
-
- /*
- * This is a workaround until fuse uses iomap for reads.
- * For fuseblk servers, this represents the blocksize passed in at
- * mount time and for regular fuse servers, this is equivalent to
- * inode->i_blkbits.
- */
- u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d1babf56f254..1a397be53f49 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -160,7 +160,7 @@ static void fuse_evict_inode(struct inode *inode)
struct fuse_inode *fi = get_fuse_inode(inode);
/* Will write inode on close/munmap and in all other dirtiers */
- WARN_ON(inode->i_state & I_DIRTY_INODE);
+ WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
if (FUSE_IS_DAX(inode))
dax_break_layout_final(inode);
@@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
if (attr->blksize)
fi->cached_i_blkbits = ilog2(attr->blksize);
else
- fi->cached_i_blkbits = fc->blkbits;
+ fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -505,7 +505,7 @@ retry:
if (!inode)
return NULL;
- if ((inode->i_state & I_NEW)) {
+ if ((inode_state_read_once(inode) & I_NEW)) {
inode->i_flags |= S_NOATIME;
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
@@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
- /*
- * This is a workaround until fuse hooks into iomap for reads.
- * Use PAGE_SIZE for the blocksize else if the writeback cache
- * is enabled, buffered writes go through iomap and a read may
- * overwrite partially written data if blocksize < PAGE_SIZE
- */
- fc->blkbits = sb->s_blocksize_bits;
- if (ctx->blksize != PAGE_SIZE &&
- !sb_set_blocksize(sb, PAGE_SIZE))
- goto err;
#endif
fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 6bc7c97b017d..b2f6486fe1d5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -373,7 +373,7 @@ static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs)
sprintf(buff, "%d", i);
fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj);
- if (!fs->mqs_kobj) {
+ if (!fsvq->kobj) {
ret = -ENOMEM;
goto out_del;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 47d74afd63ac..ff1cf335449a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -81,8 +81,7 @@ static int gfs2_write_jdata_folio(struct folio *folio,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- if (folio_pos(folio) < i_size &&
- i_size < folio_pos(folio) + folio_size(folio))
+ if (folio_pos(folio) < i_size && i_size < folio_next_pos(folio))
folio_zero_segment(folio, offset_in_folio(folio, i_size),
folio_size(folio));
@@ -311,10 +310,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag = PAGECACHE_TAG_TOWRITE;
- else
- tag = PAGECACHE_TAG_DIRTY;
+ tag = wbc_to_tag(wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
@@ -424,11 +420,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
struct inode *inode = folio->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
+ int error = 0;
if (!gfs2_is_jdata(ip) ||
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
- error = iomap_read_folio(folio, &gfs2_iomap_ops);
+ iomap_bio_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
error = stuffed_read_folio(ip, folio);
} else {
@@ -503,7 +499,7 @@ static void gfs2_readahead(struct readahead_control *rac)
else if (gfs2_is_jdata(ip))
mpage_readahead(rac, gfs2_block_map);
else
- iomap_readahead(rac, &gfs2_iomap_ops);
+ iomap_bio_readahead(rac, &gfs2_iomap_ops);
}
/**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index bc67fa058c84..ee92f5910ae1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -744,7 +744,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY;
+ int sync_state = inode_state_read_once(inode) & I_DIRTY;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b677c0e6b9ab..c9712235e7a0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -957,7 +957,7 @@ static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
- wait_on_inode(&ip->i_inode);
+ wait_on_new_inode(&ip->i_inode);
if (is_bad_inode(&ip->i_inode)) {
iput(&ip->i_inode);
ip = NULL;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 0c0a80b3baca..c94e42b0c94d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -394,7 +394,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
struct inode *inode = &ip->i_inode;
- bool is_new = inode->i_state & I_NEW;
+ bool is_new = inode_state_read_once(inode) & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) {
gfs2_consist_inode(ip);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 8a7ed80d9f2d..890c87e3e365 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -127,7 +127,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
ip = GFS2_I(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_glock *io_gl;
int extra_flags = 0;
@@ -924,7 +924,7 @@ fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
iget_failed(inode);
else
iput(inode);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aa15183f9a16..889682f051ea 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1751,7 +1751,7 @@ static void gfs2_evict_inodes(struct super_block *sb)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) &&
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) &&
!need_resched()) {
spin_unlock(&inode->i_lock);
continue;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 22e62fe7448b..54c20d01c342 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -42,7 +42,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->inode = iget_locked(sb, id);
if (!tree->inode)
goto free_tree;
- BUG_ON(!(tree->inode->i_state & I_NEW));
+ BUG_ON(!(inode_state_read_once(tree->inode) & I_NEW));
{
struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
HFS_I(tree->inode)->flags = 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9cd449913dc8..81ad93e6312f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -412,7 +412,7 @@ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_
return NULL;
}
inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data);
- if (inode && (inode->i_state & I_NEW))
+ if (inode && (inode_state_read_once(inode) & I_NEW))
unlock_new_inode(inode);
return inode;
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index a66a09a56bf7..9b377481f397 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/nls.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 16bc4abc67e0..54e85e25a259 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -65,7 +65,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 1e1acf5775ab..51d26aa2b93e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -581,7 +581,7 @@ static struct inode *hostfs_iget(struct super_block *sb, char *name)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
unlock_new_inode(inode);
} else {
spin_lock(&inode->i_lock);
@@ -979,7 +979,7 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct hostfs_fs_info *fsi = fc->s_fs_info;
struct fs_parse_result result;
- char *host_root;
+ char *host_root, *tmp_root;
int opt;
opt = fs_parse(fc, hostfs_param_specs, param, &result);
@@ -990,11 +990,13 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_hostfs:
host_root = param->string;
if (!*host_root)
- host_root = "";
- fsi->host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
- if (fsi->host_root_path == NULL)
+ break;
+ tmp_root = kasprintf(GFP_KERNEL, "%s%s",
+ fsi->host_root_path, host_root);
+ if (!tmp_root)
return -ENOMEM;
+ kfree(fsi->host_root_path);
+ fsi->host_root_path = tmp_root;
break;
}
@@ -1004,17 +1006,17 @@ static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
static int hostfs_parse_monolithic(struct fs_context *fc, void *data)
{
struct hostfs_fs_info *fsi = fc->s_fs_info;
- char *host_root = (char *)data;
+ char *tmp_root, *host_root = (char *)data;
/* NULL is printed as '(null)' by printf(): avoid that. */
if (host_root == NULL)
- host_root = "";
+ return 0;
- fsi->host_root_path =
- kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
- if (fsi->host_root_path == NULL)
+ tmp_root = kasprintf(GFP_KERNEL, "%s%s", fsi->host_root_path, host_root);
+ if (!tmp_root)
return -ENOMEM;
-
+ kfree(fsi->host_root_path);
+ fsi->host_root_path = tmp_root;
return 0;
}
@@ -1049,6 +1051,11 @@ static int hostfs_init_fs_context(struct fs_context *fc)
if (!fsi)
return -ENOMEM;
+ fsi->host_root_path = kasprintf(GFP_KERNEL, "%s/", root_ino);
+ if (!fsi->host_root_path) {
+ kfree(fsi);
+ return -ENOMEM;
+ }
fc->s_fs_info = fsi;
fc->ops = &hostfs_context_ops;
return 0;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 49dd585c2b17..ceb50b2dc91a 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -247,7 +247,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
result = ERR_PTR(-ENOMEM);
goto bail1;
}
- if (result->i_state & I_NEW) {
+ if (inode_state_read_once(result) & I_NEW) {
hpfs_init_inode(result);
if (de->directory)
hpfs_read_inode(result);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 34008442ee26..93d528f4f4f2 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -196,7 +196,7 @@ void hpfs_write_inode(struct inode *i)
parent = iget_locked(i->i_sb, hpfs_inode->i_parent_dir);
if (parent) {
hpfs_inode->i_dirty = 0;
- if (parent->i_state & I_NEW) {
+ if (inode_state_read_once(parent) & I_NEW) {
hpfs_init_inode(parent);
hpfs_read_inode(parent);
unlock_new_inode(parent);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 8ab85e7ac91e..371aa6de8075 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,6 +9,7 @@
#include "hpfs_fn.h"
#include <linux/module.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/init.h>
diff --git a/fs/init.c b/fs/init.c
index 07f592ccdba8..e0f5429c0a49 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -157,7 +157,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
error = security_path_mknod(&path, dentry, mode, dev);
if (!error)
error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, mode, new_decode_dev(dev));
+ dentry, mode, new_decode_dev(dev), NULL);
end_creating_path(&path, dentry);
return error;
}
@@ -209,7 +209,7 @@ int __init init_symlink(const char *oldname, const char *newname)
error = security_path_symlink(&path, dentry, oldname);
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, oldname);
+ dentry, oldname, NULL);
end_creating_path(&path, dentry);
return error;
}
@@ -233,7 +233,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
error = security_path_mkdir(&path, dentry, mode);
if (!error) {
dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, mode);
+ dentry, mode, NULL);
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
diff --git a/fs/inode.c b/fs/inode.c
index ec9339024ac3..cc8265cfe80e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -233,7 +233,7 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
- inode->i_state = 0;
+ inode_state_assign_raw(inode, 0);
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@@ -471,7 +471,7 @@ EXPORT_SYMBOL(set_nlink);
void inc_nlink(struct inode *inode)
{
if (unlikely(inode->i_nlink == 0)) {
- WARN_ON(!(inode->i_state & I_LINKABLE));
+ WARN_ON(!(inode_state_read_once(inode) & I_LINKABLE));
atomic_long_dec(&inode->i_sb->s_remove_count);
}
@@ -530,9 +530,48 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
-static void __inode_add_lru(struct inode *inode, bool rotate)
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+ struct inode *inode, u32 bit)
+{
+ void *bit_address;
+
+ bit_address = inode_state_wait_address(inode, bit);
+ init_wait_var_entry(wqe, bit_address, 0);
+ return __var_waitqueue(bit_address);
+}
+EXPORT_SYMBOL(inode_bit_waitqueue);
+
+void wait_on_new_inode(struct inode *inode)
+{
+ struct wait_bit_queue_entry wqe;
+ struct wait_queue_head *wq_head;
+
+ spin_lock(&inode->i_lock);
+ if (!(inode_state_read(inode) & I_NEW)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+ for (;;) {
+ prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (!(inode_state_read(inode) & I_NEW))
+ break;
+ spin_unlock(&inode->i_lock);
+ schedule();
+ spin_lock(&inode->i_lock);
+ }
+ finish_wait(wq_head, &wqe.wq_entry);
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL(wait_on_new_inode);
+
+static void __inode_lru_list_add(struct inode *inode, bool rotate)
{
- if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode_state_read(inode) & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
return;
if (icount_read(inode))
return;
@@ -544,32 +583,22 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
else if (rotate)
- inode->i_state |= I_REFERENCED;
-}
-
-struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
- struct inode *inode, u32 bit)
-{
- void *bit_address;
-
- bit_address = inode_state_wait_address(inode, bit);
- init_wait_var_entry(wqe, bit_address, 0);
- return __var_waitqueue(bit_address);
+ inode_state_set(inode, I_REFERENCED);
}
-EXPORT_SYMBOL(inode_bit_waitqueue);
/*
* Add inode to LRU if needed (inode is unused and clean).
- *
- * Needs inode->i_lock held.
*/
-void inode_add_lru(struct inode *inode)
+void inode_lru_list_add(struct inode *inode)
{
- __inode_add_lru(inode, false);
+ __inode_lru_list_add(inode, false);
}
static void inode_lru_list_del(struct inode *inode)
{
+ if (list_empty(&inode->i_lru))
+ return;
+
if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -577,15 +606,15 @@ static void inode_lru_list_del(struct inode *inode)
static void inode_pin_lru_isolating(struct inode *inode)
{
lockdep_assert_held(&inode->i_lock);
- WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
- inode->i_state |= I_LRU_ISOLATING;
+ WARN_ON(inode_state_read(inode) & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
+ inode_state_set(inode, I_LRU_ISOLATING);
}
static void inode_unpin_lru_isolating(struct inode *inode)
{
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
- inode->i_state &= ~I_LRU_ISOLATING;
+ WARN_ON(!(inode_state_read(inode) & I_LRU_ISOLATING));
+ inode_state_clear(inode, I_LRU_ISOLATING);
/* Called with inode->i_lock which ensures memory ordering. */
inode_wake_up_bit(inode, __I_LRU_ISOLATING);
spin_unlock(&inode->i_lock);
@@ -597,7 +626,7 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
struct wait_queue_head *wq_head;
lockdep_assert_held(&inode->i_lock);
- if (!(inode->i_state & I_LRU_ISOLATING))
+ if (!(inode_state_read(inode) & I_LRU_ISOLATING))
return;
wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
@@ -607,14 +636,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
* Checking I_LRU_ISOLATING with inode->i_lock guarantees
* memory ordering.
*/
- if (!(inode->i_state & I_LRU_ISOLATING))
+ if (!(inode_state_read(inode) & I_LRU_ISOLATING))
break;
spin_unlock(&inode->i_lock);
schedule();
spin_lock(&inode->i_lock);
}
finish_wait(wq_head, &wqe.wq_entry);
- WARN_ON(inode->i_state & I_LRU_ISOLATING);
+ WARN_ON(inode_state_read(inode) & I_LRU_ISOLATING);
}
/**
@@ -761,11 +790,11 @@ void clear_inode(struct inode *inode)
*/
xa_unlock_irq(&inode->i_data.i_pages);
BUG_ON(!list_empty(&inode->i_data.i_private_list));
- BUG_ON(!(inode->i_state & I_FREEING));
- BUG_ON(inode->i_state & I_CLEAR);
+ BUG_ON(!(inode_state_read_once(inode) & I_FREEING));
+ BUG_ON(inode_state_read_once(inode) & I_CLEAR);
BUG_ON(!list_empty(&inode->i_wb_list));
/* don't need i_lock here, no concurrent mods to i_state */
- inode->i_state = I_FREEING | I_CLEAR;
+ inode_state_assign_raw(inode, I_FREEING | I_CLEAR);
}
EXPORT_SYMBOL(clear_inode);
@@ -786,12 +815,10 @@ static void evict(struct inode *inode)
{
const struct super_operations *op = inode->i_sb->s_op;
- BUG_ON(!(inode->i_state & I_FREEING));
+ BUG_ON(!(inode_state_read_once(inode) & I_FREEING));
BUG_ON(!list_empty(&inode->i_lru));
- if (!list_empty(&inode->i_io_list))
- inode_io_list_del(inode);
-
+ inode_io_list_del(inode);
inode_sb_list_del(inode);
spin_lock(&inode->i_lock);
@@ -829,7 +856,7 @@ static void evict(struct inode *inode)
* This also means we don't need any fences for the call below.
*/
inode_wake_up_bit(inode, __I_NEW);
- BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+ BUG_ON(inode_state_read_once(inode) != (I_FREEING | I_CLEAR));
destroy_inode(inode);
}
@@ -879,12 +906,12 @@ again:
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_NEW | I_FREEING | I_WILL_FREE)) {
spin_unlock(&inode->i_lock);
continue;
}
- inode->i_state |= I_FREEING;
+ inode_state_set(inode, I_FREEING);
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
@@ -938,7 +965,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* sync, or the last page cache deletion will requeue them.
*/
if (icount_read(inode) ||
- (inode->i_state & ~I_REFERENCED) ||
+ (inode_state_read(inode) & ~I_REFERENCED) ||
!mapping_shrinkable(&inode->i_data)) {
list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
@@ -947,8 +974,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
}
/* Recently referenced inodes get one more pass */
- if (inode->i_state & I_REFERENCED) {
- inode->i_state &= ~I_REFERENCED;
+ if (inode_state_read(inode) & I_REFERENCED) {
+ inode_state_clear(inode, I_REFERENCED);
spin_unlock(&inode->i_lock);
return LRU_ROTATE;
}
@@ -975,8 +1002,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
return LRU_RETRY;
}
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state |= I_FREEING;
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ inode_state_set(inode, I_FREEING);
list_lru_isolate_move(lru, &inode->i_lru, freeable);
spin_unlock(&inode->i_lock);
@@ -1008,7 +1035,8 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
- void *data, bool is_inode_hash_locked)
+ void *data, bool is_inode_hash_locked,
+ bool *isnew)
{
struct inode *inode = NULL;
@@ -1025,16 +1053,17 @@ repeat:
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, is_inode_hash_locked);
goto repeat;
}
- if (unlikely(inode->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(inode) & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
+ *isnew = !!(inode_state_read(inode) & I_NEW);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
@@ -1049,7 +1078,7 @@ repeat:
*/
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino,
- bool is_inode_hash_locked)
+ bool is_inode_hash_locked, bool *isnew)
{
struct inode *inode = NULL;
@@ -1066,16 +1095,17 @@ repeat:
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, is_inode_hash_locked);
goto repeat;
}
- if (unlikely(inode->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(inode) & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
+ *isnew = !!(inode_state_read(inode) & I_NEW);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
@@ -1180,14 +1210,8 @@ void unlock_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW & ~I_CREATING;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
}
@@ -1197,14 +1221,8 @@ void discard_new_inode(struct inode *inode)
{
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_NEW));
- inode->i_state &= ~I_NEW;
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb();
+ WARN_ON(!(inode_state_read(inode) & I_NEW));
+ inode_state_clear(inode, I_NEW);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
iput(inode);
@@ -1260,6 +1278,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @test: callback used for comparisons between inodes
* @set: callback used to initialize a new struct inode
* @data: opaque data pointer to pass to @test and @set
+ * @isnew: pointer to a bool which will indicate whether I_NEW is set
*
* Search for the inode specified by @hashval and @data in the inode cache,
* and if present return it with an increased reference count. This is a
@@ -1278,12 +1297,13 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
struct inode *old;
+ bool isnew;
might_sleep();
again:
spin_lock(&inode_hash_lock);
- old = find_inode(inode->i_sb, head, test, data, true);
+ old = find_inode(inode->i_sb, head, test, data, true, &isnew);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@@ -1292,7 +1312,8 @@ again:
spin_unlock(&inode_hash_lock);
if (IS_ERR(old))
return NULL;
- wait_on_inode(old);
+ if (unlikely(isnew))
+ wait_on_new_inode(old);
if (unlikely(inode_unhashed(old))) {
iput(old);
goto again;
@@ -1310,7 +1331,7 @@ again:
* caller is responsible for filling in the contents
*/
spin_lock(&inode->i_lock);
- inode->i_state |= I_NEW;
+ inode_state_set(inode, I_NEW);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
@@ -1383,15 +1404,17 @@ struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
+ bool isnew;
might_sleep();
again:
- inode = find_inode(sb, head, test, data, false);
+ inode = find_inode(sb, head, test, data, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1426,15 +1449,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = find_inode_fast(sb, head, ino, false);
+ inode = find_inode_fast(sb, head, ino, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1448,11 +1473,11 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
- old = find_inode_fast(sb, head, ino, true);
+ old = find_inode_fast(sb, head, ino, true, &isnew);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
- inode->i_state = I_NEW;
+ inode_state_assign(inode, I_NEW);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
@@ -1474,7 +1499,8 @@ again:
if (IS_ERR(old))
return NULL;
inode = old;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1545,7 +1571,7 @@ EXPORT_SYMBOL(iunique);
struct inode *igrab(struct inode *inode)
{
spin_lock(&inode->i_lock);
- if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
+ if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
__iget(inode);
spin_unlock(&inode->i_lock);
} else {
@@ -1578,13 +1604,13 @@ EXPORT_SYMBOL(igrab);
* Note2: @test is called with the inode_hash_lock held, so can't sleep.
*/
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
- int (*test)(struct inode *, void *), void *data)
+ int (*test)(struct inode *, void *), void *data, bool *isnew)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data, true);
+ inode = find_inode(sb, head, test, data, true, isnew);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@@ -1612,13 +1638,15 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = ilookup5_nowait(sb, hashval, test, data);
+ inode = ilookup5_nowait(sb, hashval, test, data, &isnew);
if (inode) {
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1640,16 +1668,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
+ bool isnew;
might_sleep();
again:
- inode = find_inode_fast(sb, head, ino, false);
+ inode = find_inode_fast(sb, head, ino, false, &isnew);
if (inode) {
if (IS_ERR(inode))
return NULL;
- wait_on_inode(inode);
+ if (unlikely(isnew))
+ wait_on_new_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
@@ -1741,7 +1771,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb == sb &&
- !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
+ !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)) &&
test(inode, data))
return inode;
}
@@ -1780,7 +1810,7 @@ struct inode *find_inode_by_ino_rcu(struct super_block *sb,
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino == ino &&
inode->i_sb == sb &&
- !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
+ !(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE)))
return inode;
}
return NULL;
@@ -1792,6 +1822,7 @@ int insert_inode_locked(struct inode *inode)
struct super_block *sb = inode->i_sb;
ino_t ino = inode->i_ino;
struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ bool isnew;
might_sleep();
@@ -1804,7 +1835,7 @@ int insert_inode_locked(struct inode *inode)
if (old->i_sb != sb)
continue;
spin_lock(&old->i_lock);
- if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+ if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) {
spin_unlock(&old->i_lock);
continue;
}
@@ -1812,21 +1843,23 @@ int insert_inode_locked(struct inode *inode)
}
if (likely(!old)) {
spin_lock(&inode->i_lock);
- inode->i_state |= I_NEW | I_CREATING;
+ inode_state_set(inode, I_NEW | I_CREATING);
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
return 0;
}
- if (unlikely(old->i_state & I_CREATING)) {
+ if (unlikely(inode_state_read(old) & I_CREATING)) {
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
return -EBUSY;
}
__iget(old);
+ isnew = !!(inode_state_read(old) & I_NEW);
spin_unlock(&old->i_lock);
spin_unlock(&inode_hash_lock);
- wait_on_inode(old);
+ if (isnew)
+ wait_on_new_inode(old);
if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
@@ -1843,7 +1876,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
might_sleep();
- inode->i_state |= I_CREATING;
+ inode_state_set_raw(inode, I_CREATING);
old = inode_insert5(inode, hashval, test, NULL, data);
if (old != inode) {
@@ -1875,10 +1908,10 @@ static void iput_final(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
const struct super_operations *op = inode->i_sb->s_op;
- unsigned long state;
int drop;
- WARN_ON(inode->i_state & I_NEW);
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
if (op->drop_inode)
drop = op->drop_inode(inode);
@@ -1886,29 +1919,33 @@ static void iput_final(struct inode *inode)
drop = inode_generic_drop(inode);
if (!drop &&
- !(inode->i_state & I_DONTCACHE) &&
+ !(inode_state_read(inode) & I_DONTCACHE) &&
(sb->s_flags & SB_ACTIVE)) {
- __inode_add_lru(inode, true);
+ __inode_lru_list_add(inode, true);
spin_unlock(&inode->i_lock);
return;
}
- state = inode->i_state;
- if (!drop) {
- WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
+ /*
+ * Re-check ->i_count in case the ->drop_inode() hooks played games.
+ * Note we only execute this if the verdict was to drop the inode.
+ */
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+
+ if (drop) {
+ inode_state_set(inode, I_FREEING);
+ } else {
+ inode_state_set(inode, I_WILL_FREE);
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
spin_lock(&inode->i_lock);
- state = inode->i_state;
- WARN_ON(state & I_NEW);
- state &= ~I_WILL_FREE;
+ WARN_ON(inode_state_read(inode) & I_NEW);
+ inode_state_replace(inode, I_WILL_FREE, I_FREEING);
}
- WRITE_ONCE(inode->i_state, state | I_FREEING);
- if (!list_empty(&inode->i_lru))
- inode_lru_list_del(inode);
+ inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
evict(inode);
@@ -1931,7 +1968,7 @@ void iput(struct inode *inode)
retry:
lockdep_assert_not_held(&inode->i_lock);
- VFS_BUG_ON_INODE(inode->i_state & I_CLEAR, inode);
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode);
/*
* Note this assert is technically racy as if the count is bogusly
* equal to one, then two CPUs racing to further drop it can both
@@ -1942,14 +1979,14 @@ retry:
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- if ((inode->i_state & I_DIRTY_TIME) && inode->i_nlink) {
+ if ((inode_state_read_once(inode) & I_DIRTY_TIME) && inode->i_nlink) {
trace_writeback_lazytime_iput(inode);
mark_inode_dirty_sync(inode);
goto retry;
}
spin_lock(&inode->i_lock);
- if (unlikely((inode->i_state & I_DIRTY_TIME) && inode->i_nlink)) {
+ if (unlikely((inode_state_read(inode) & I_DIRTY_TIME) && inode->i_nlink)) {
spin_unlock(&inode->i_lock);
goto retry;
}
@@ -1967,6 +2004,18 @@ retry:
}
EXPORT_SYMBOL(iput);
+/**
+ * iput_not_last - put an inode assuming this is not the last reference
+ * @inode: inode to put
+ */
+void iput_not_last(struct inode *inode)
+{
+ VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode);
+
+ WARN_ON(atomic_sub_return(1, &inode->i_count) == 0);
+}
+EXPORT_SYMBOL(iput_not_last);
+
#ifdef CONFIG_BLOCK
/**
* bmap - find a block number in a file
@@ -2310,42 +2359,40 @@ out:
}
EXPORT_SYMBOL(current_time);
-static int inode_needs_update_time(struct inode *inode)
+static int file_update_time_flags(struct file *file, unsigned int flags)
{
+ struct inode *inode = file_inode(file);
struct timespec64 now, ts;
- int sync_it = 0;
+ int sync_mode = 0;
+ int ret = 0;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
+ if (unlikely(file->f_mode & FMODE_NOCMTIME))
+ return 0;
now = current_time(inode);
ts = inode_get_mtime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it |= S_MTIME;
-
+ sync_mode |= S_MTIME;
ts = inode_get_ctime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it |= S_CTIME;
-
+ sync_mode |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
- sync_it |= S_VERSION;
+ sync_mode |= S_VERSION;
- return sync_it;
-}
-
-static int __file_update_time(struct file *file, int sync_mode)
-{
- int ret = 0;
- struct inode *inode = file_inode(file);
+ if (!sync_mode)
+ return 0;
- /* try to update time settings */
- if (!mnt_get_write_access_file(file)) {
- ret = inode_update_time(inode, sync_mode);
- mnt_put_write_access_file(file);
- }
+ if (flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ if (mnt_get_write_access_file(file))
+ return 0;
+ ret = inode_update_time(inode, sync_mode);
+ mnt_put_write_access_file(file);
return ret;
}
@@ -2365,14 +2412,7 @@ static int __file_update_time(struct file *file, int sync_mode)
*/
int file_update_time(struct file *file)
{
- int ret;
- struct inode *inode = file_inode(file);
-
- ret = inode_needs_update_time(inode);
- if (ret <= 0)
- return ret;
-
- return __file_update_time(file, ret);
+ return file_update_time_flags(file, 0);
}
EXPORT_SYMBOL(file_update_time);
@@ -2394,7 +2434,6 @@ EXPORT_SYMBOL(file_update_time);
static int file_modified_flags(struct file *file, int flags)
{
int ret;
- struct inode *inode = file_inode(file);
/*
* Clear the security bits if the process is not being run by root.
@@ -2403,17 +2442,7 @@ static int file_modified_flags(struct file *file, int flags)
ret = file_remove_privs_flags(file, flags);
if (ret)
return ret;
-
- if (unlikely(file->f_mode & FMODE_NOCMTIME))
- return 0;
-
- ret = inode_needs_update_time(inode);
- if (ret <= 0)
- return ret;
- if (flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- return __file_update_time(file, ret);
+ return file_update_time_flags(file, flags);
}
/**
@@ -2970,7 +2999,7 @@ void dump_inode(struct inode *inode, const char *reason)
pr_warn("%s encountered for inode %px\n"
"fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n",
reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags,
- inode->i_flags, inode->i_state, atomic_read(&inode->i_count));
+ inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count));
}
EXPORT_SYMBOL(dump_inode);
diff --git a/fs/internal.h b/fs/internal.h
index 9b2b4d116880..d08d5e2235e9 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -67,6 +67,9 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
const struct path *parentpath,
struct file *file, umode_t mode);
struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+ unsigned int lookup_flags);
+int lookup_noperm_common(struct qstr *qname, struct dentry *base);
/*
* namespace.c
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index f7e1c8534c46..a572b8808524 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -14,5 +14,6 @@ iomap-y += trace.o \
iomap-$(CONFIG_BLOCK) += direct-io.o \
ioend.o \
fiemap.o \
- seek.o
+ seek.o \
+ bio.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
new file mode 100644
index 000000000000..fc045f2e4c45
--- /dev/null
+++ b/fs/iomap/bio.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+#include "trace.h"
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
+ bio_put(bio);
+}
+
+static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
+{
+ struct bio *bio = ctx->read_ctx;
+
+ if (bio)
+ submit_bio(bio);
+}
+
+static int iomap_bio_read_folio_range(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t plen)
+{
+ struct folio *folio = ctx->cur_folio;
+ const struct iomap *iomap = &iter->iomap;
+ loff_t pos = iter->pos;
+ size_t poff = offset_in_folio(folio, pos);
+ loff_t length = iomap_length(iter);
+ sector_t sector;
+ struct bio *bio = ctx->read_ctx;
+
+ sector = iomap_sector(iomap, pos);
+ if (!bio || bio_end_sector(bio) != sector ||
+ !bio_add_folio(bio, folio, plen, poff)) {
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ if (bio)
+ submit_bio(bio);
+
+ if (ctx->rac) /* same as readahead_gfp_mask */
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ,
+ gfp);
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_read_folio does.
+ */
+ if (!bio)
+ bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp);
+ if (ctx->rac)
+ bio->bi_opf |= REQ_RAHEAD;
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_end_io = iomap_read_end_io;
+ bio_add_folio_nofail(bio, folio, plen, poff);
+ ctx->read_ctx = bio;
+ }
+ return 0;
+}
+
+const struct iomap_read_ops iomap_bio_read_ops = {
+ .read_folio_range = iomap_bio_read_folio_range,
+ .submit_read = iomap_bio_submit_read,
+};
+EXPORT_SYMBOL_GPL(iomap_bio_read_ops);
+
+int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
+ bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
+ return submit_bio_wait(&bio);
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8b847a1e27f1..e5c1ca440d93 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -8,6 +8,7 @@
#include <linux/writeback.h>
#include <linux/swap.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -37,10 +38,28 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio,
return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}
-static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
- unsigned int block)
+/*
+ * Find the next uptodate block in the folio. end_blk is inclusive.
+ * If no uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_uptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_bit(ifs->state, end_blk + 1, start_blk);
+}
+
+/*
+ * Find the next non-uptodate block in the folio. end_blk is inclusive.
+ * If no non-uptodate block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_nonuptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
- return test_bit(block, ifs->state);
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_zero_bit(ifs->state, end_blk + 1, start_blk);
}
static bool ifs_set_range_uptodate(struct folio *folio,
@@ -75,13 +94,34 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
folio_mark_uptodate(folio);
}
-static inline bool ifs_block_is_dirty(struct folio *folio,
- struct iomap_folio_state *ifs, int block)
+/*
+ * Find the next dirty block in the folio. end_blk is inclusive.
+ * If no dirty block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_dirty_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
+ struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int blks = i_blocks_per_folio(inode, folio);
+
+ return find_next_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
+}
+
+/*
+ * Find the next clean block in the folio. end_blk is inclusive.
+ * If no clean block is found, this will return end_blk + 1.
+ */
+static unsigned ifs_next_clean_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks = i_blocks_per_folio(inode, folio);
- return test_bit(block + blks_per_folio, ifs->state);
+ return find_next_zero_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
}
static unsigned ifs_find_dirty_range(struct folio *folio,
@@ -92,18 +132,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio,
offset_in_folio(folio, *range_start) >> inode->i_blkbits;
unsigned end_blk = min_not_zero(
offset_in_folio(folio, range_end) >> inode->i_blkbits,
- i_blocks_per_folio(inode, folio));
- unsigned nblks = 1;
+ i_blocks_per_folio(inode, folio)) - 1;
+ unsigned nblks;
- while (!ifs_block_is_dirty(folio, ifs, start_blk))
- if (++start_blk == end_blk)
- return 0;
-
- while (start_blk + nblks < end_blk) {
- if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
- break;
- nblks++;
- }
+ start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
+ if (start_blk > end_blk)
+ return 0;
+ if (start_blk == end_blk)
+ nblks = 1;
+ else
+ nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk) -
+ start_blk;
*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
return nblks << inode->i_blkbits;
@@ -218,6 +257,22 @@ static void ifs_free(struct folio *folio)
}
/*
+ * Calculate how many bytes to truncate based off the number of blocks to
+ * truncate and the end position to start truncating from.
+ */
+static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
+ unsigned blocks_truncated)
+{
+ unsigned block_size = 1 << block_bits;
+ unsigned block_offset = end_pos & (block_size - 1);
+
+ if (!block_offset)
+ return blocks_truncated << block_bits;
+
+ return ((blocks_truncated - 1) << block_bits) + block_offset;
+}
+
+/*
* Calculate the range inside the folio that we actually need to read.
*/
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
@@ -240,24 +295,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* to avoid reading in already uptodate ranges.
*/
if (ifs) {
- unsigned int i;
-
- /* move forward for each leading block marked uptodate */
- for (i = first; i <= last; i++) {
- if (!ifs_block_is_uptodate(ifs, i))
- break;
- *pos += block_size;
- poff += block_size;
- plen -= block_size;
- first++;
+ unsigned int next, blocks_skipped;
+
+ next = ifs_next_nonuptodate_block(folio, first, last);
+ blocks_skipped = next - first;
+
+ if (blocks_skipped) {
+ unsigned long block_offset = *pos & (block_size - 1);
+ unsigned bytes_skipped =
+ (blocks_skipped << block_bits) - block_offset;
+
+ *pos += bytes_skipped;
+ poff += bytes_skipped;
+ plen -= bytes_skipped;
}
+ first = next;
/* truncate len if we find any trailing uptodate block(s) */
- while (++i <= last) {
- if (ifs_block_is_uptodate(ifs, i)) {
- plen -= (last - i + 1) * block_size;
- last = i - 1;
- break;
+ if (++next <= last) {
+ next = ifs_next_uptodate_block(folio, next, last);
+ if (next <= last) {
+ plen -= iomap_bytes_to_truncate(*pos + plen,
+ block_bits, last - next + 1);
+ last = next - 1;
}
}
}
@@ -271,7 +331,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
- plen -= (last - end) * block_size;
+ plen -= iomap_bytes_to_truncate(*pos + plen, block_bits,
+ last - end);
}
*offp = poff;
@@ -320,9 +381,8 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
return 0;
}
-#ifdef CONFIG_BLOCK
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
- size_t len, int error)
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+ int error)
{
struct iomap_folio_state *ifs = folio->private;
bool uptodate = !error;
@@ -342,169 +402,201 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off,
if (finished)
folio_end_read(folio, uptodate);
}
+EXPORT_SYMBOL_GPL(iomap_finish_folio_read);
-static void iomap_read_end_io(struct bio *bio)
+static void iomap_read_init(struct folio *folio)
{
- int error = blk_status_to_errno(bio->bi_status);
- struct folio_iter fi;
+ struct iomap_folio_state *ifs = folio->private;
- bio_for_each_folio_all(fi, bio)
- iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
- bio_put(bio);
+ if (ifs) {
+ size_t len = folio_size(folio);
+
+ /*
+ * ifs->read_bytes_pending is used to track how many bytes are
+ * read in asynchronously by the IO helper. We need to track
+ * this so that we can know when the IO helper has finished
+ * reading in all the necessary ranges of the folio and can end
+ * the read.
+ *
+ * Increase ->read_bytes_pending by the folio size to start, and
+ * add a +1 bias. We'll subtract the bias and any uptodate /
+ * zeroed ranges that did not require IO in iomap_read_end()
+ * after we're done processing the folio.
+ *
+ * We do this because otherwise, we would have to increment
+ * ifs->read_bytes_pending every time a range in the folio needs
+ * to be read in, which can get expensive since the spinlock
+ * needs to be held whenever modifying ifs->read_bytes_pending.
+ *
+ * We add the bias to ensure the read has not been ended on the
+ * folio when iomap_read_end() is called, even if the IO helper
+ * has already finished reading in the entire folio.
+ */
+ spin_lock_irq(&ifs->state_lock);
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
+ ifs->read_bytes_pending = len + 1;
+ spin_unlock_irq(&ifs->state_lock);
+ }
}
-struct iomap_readpage_ctx {
- struct folio *cur_folio;
- bool cur_folio_in_bio;
- struct bio *bio;
- struct readahead_control *rac;
-};
+/*
+ * This ends IO if no bytes were submitted to an IO helper.
+ *
+ * Otherwise, this calibrates ifs->read_bytes_pending to represent only the
+ * submitted bytes (see comment in iomap_read_init()). If all bytes submitted
+ * have already been completed by the IO helper, then this will end the read.
+ * Else the IO helper will end the read after all submitted ranges have been
+ * read.
+ */
+static void iomap_read_end(struct folio *folio, size_t bytes_submitted)
+{
+ struct iomap_folio_state *ifs = folio->private;
-static int iomap_readpage_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
+ if (ifs) {
+ bool end_read, uptodate;
+
+ spin_lock_irq(&ifs->state_lock);
+ if (!ifs->read_bytes_pending) {
+ WARN_ON_ONCE(bytes_submitted);
+ spin_unlock_irq(&ifs->state_lock);
+ folio_unlock(folio);
+ return;
+ }
+
+ /*
+ * Subtract any bytes that were initially accounted to
+ * read_bytes_pending but skipped for IO. The +1 accounts for
+ * the bias we added in iomap_read_init().
+ */
+ ifs->read_bytes_pending -=
+ (folio_size(folio) + 1 - bytes_submitted);
+
+ /*
+ * If !ifs->read_bytes_pending, this means all pending reads by
+ * the IO helper have already completed, which means we need to
+ * end the folio read here. If ifs->read_bytes_pending != 0,
+ * the IO helper will end the folio read.
+ */
+ end_read = !ifs->read_bytes_pending;
+ if (end_read)
+ uptodate = ifs_is_fully_uptodate(folio, ifs);
+ spin_unlock_irq(&ifs->state_lock);
+ if (end_read)
+ folio_end_read(folio, uptodate);
+ } else if (!bytes_submitted) {
+ /*
+ * If there were no bytes submitted, this means we are
+ * responsible for unlocking the folio here, since no IO helper
+ * has taken ownership of it. If there were bytes submitted,
+ * then the IO helper will end the read via
+ * iomap_finish_folio_read().
+ */
+ folio_unlock(folio);
+ }
+}
+
+static int iomap_read_folio_iter(struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t *bytes_submitted)
{
const struct iomap *iomap = &iter->iomap;
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
- struct iomap_folio_state *ifs;
size_t poff, plen;
- sector_t sector;
+ loff_t pos_diff;
int ret;
if (iomap->type == IOMAP_INLINE) {
ret = iomap_read_inline_data(iter, folio);
if (ret)
return ret;
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
- /* zero post-eof blocks as the page may be mapped */
- ifs = ifs_alloc(iter->inode, folio, iter->flags);
- iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
- if (plen == 0)
- goto done;
+ ifs_alloc(iter->inode, folio, iter->flags);
- if (iomap_block_needs_zeroing(iter, pos)) {
- folio_zero_range(folio, poff, plen);
- iomap_set_range_uptodate(folio, poff, plen);
- goto done;
- }
+ length = min_t(loff_t, length,
+ folio_size(folio) - offset_in_folio(folio, pos));
+ while (length) {
+ iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff,
+ &plen);
- ctx->cur_folio_in_bio = true;
- if (ifs) {
- spin_lock_irq(&ifs->state_lock);
- ifs->read_bytes_pending += plen;
- spin_unlock_irq(&ifs->state_lock);
- }
+ pos_diff = pos - iter->pos;
+ if (WARN_ON_ONCE(pos_diff + plen > length))
+ return -EIO;
- sector = iomap_sector(iomap, pos);
- if (!ctx->bio ||
- bio_end_sector(ctx->bio) != sector ||
- !bio_add_folio(ctx->bio, folio, plen, poff)) {
- gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- gfp_t orig_gfp = gfp;
- unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
-
- if (ctx->bio)
- submit_bio(ctx->bio);
-
- if (ctx->rac) /* same as readahead_gfp_mask */
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
- /*
- * If the bio_alloc fails, try it again for a single page to
- * avoid having to deal with partial page reads. This emulates
- * what do_mpage_read_folio does.
- */
- if (!ctx->bio) {
- ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
- orig_gfp);
- }
- if (ctx->rac)
- ctx->bio->bi_opf |= REQ_RAHEAD;
- ctx->bio->bi_iter.bi_sector = sector;
- ctx->bio->bi_end_io = iomap_read_end_io;
- bio_add_folio_nofail(ctx->bio, folio, plen, poff);
- }
+ ret = iomap_iter_advance(iter, pos_diff);
+ if (ret)
+ return ret;
-done:
- /*
- * Move the caller beyond our range so that it keeps making progress.
- * For that, we have to include any leading non-uptodate ranges, but
- * we can skip trailing ones as they will be handled in the next
- * iteration.
- */
- length = pos - iter->pos + plen;
- return iomap_iter_advance(iter, &length);
-}
+ if (plen == 0)
+ return 0;
-static int iomap_read_folio_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
-{
- int ret;
+ /* zero post-eof blocks as the page may be mapped */
+ if (iomap_block_needs_zeroing(iter, pos)) {
+ folio_zero_range(folio, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
+ } else {
+ if (!*bytes_submitted)
+ iomap_read_init(folio);
+ ret = ctx->ops->read_folio_range(iter, ctx, plen);
+ if (ret)
+ return ret;
+ *bytes_submitted += plen;
+ }
- while (iomap_length(iter)) {
- ret = iomap_readpage_iter(iter, ctx);
+ ret = iomap_iter_advance(iter, plen);
if (ret)
return ret;
+ length -= pos_diff + plen;
+ pos = iter->pos;
}
-
return 0;
}
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
+void iomap_read_folio(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx)
{
+ struct folio *folio = ctx->cur_folio;
struct iomap_iter iter = {
.inode = folio->mapping->host,
.pos = folio_pos(folio),
.len = folio_size(folio),
};
- struct iomap_readpage_ctx ctx = {
- .cur_folio = folio,
- };
+ size_t bytes_submitted = 0;
int ret;
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, ctx,
+ &bytes_submitted);
- if (ctx.bio) {
- submit_bio(ctx.bio);
- WARN_ON_ONCE(!ctx.cur_folio_in_bio);
- } else {
- WARN_ON_ONCE(ctx.cur_folio_in_bio);
- folio_unlock(folio);
- }
+ if (ctx->ops->submit_read)
+ ctx->ops->submit_read(ctx);
- /*
- * Just like mpage_readahead and block_read_full_folio, we always
- * return 0 and just set the folio error flag on errors. This
- * should be cleaned up throughout the stack eventually.
- */
- return 0;
+ iomap_read_end(folio, bytes_submitted);
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
static int iomap_readahead_iter(struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx)
+ struct iomap_read_folio_ctx *ctx, size_t *cur_bytes_submitted)
{
int ret;
while (iomap_length(iter)) {
if (ctx->cur_folio &&
offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
- if (!ctx->cur_folio_in_bio)
- folio_unlock(ctx->cur_folio);
+ iomap_read_end(ctx->cur_folio, *cur_bytes_submitted);
ctx->cur_folio = NULL;
}
if (!ctx->cur_folio) {
ctx->cur_folio = readahead_folio(ctx->rac);
- ctx->cur_folio_in_bio = false;
+ if (WARN_ON_ONCE(!ctx->cur_folio))
+ return -EINVAL;
+ *cur_bytes_submitted = 0;
}
- ret = iomap_readpage_iter(iter, ctx);
+ ret = iomap_read_folio_iter(iter, ctx, cur_bytes_submitted);
if (ret)
return ret;
}
@@ -514,8 +606,8 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
/**
* iomap_readahead - Attempt to read pages from a file.
- * @rac: Describes the pages to be read.
* @ops: The operations vector for the filesystem.
+ * @ctx: The ctx used for issuing readahead.
*
* This function is for filesystems to call to implement their readahead
* address_space operation.
@@ -527,51 +619,30 @@ static int iomap_readahead_iter(struct iomap_iter *iter,
* function is called with memalloc_nofs set, so allocations will not cause
* the filesystem to be reentered.
*/
-void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
+void iomap_readahead(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx)
{
+ struct readahead_control *rac = ctx->rac;
struct iomap_iter iter = {
.inode = rac->mapping->host,
.pos = readahead_pos(rac),
.len = readahead_length(rac),
};
- struct iomap_readpage_ctx ctx = {
- .rac = rac,
- };
+ size_t cur_bytes_submitted;
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.status = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, ctx,
+ &cur_bytes_submitted);
- if (ctx.bio)
- submit_bio(ctx.bio);
- if (ctx.cur_folio) {
- if (!ctx.cur_folio_in_bio)
- folio_unlock(ctx.cur_folio);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_readahead);
-
-static int iomap_read_folio_range(const struct iomap_iter *iter,
- struct folio *folio, loff_t pos, size_t len)
-{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct bio_vec bvec;
- struct bio bio;
+ if (ctx->ops->submit_read)
+ ctx->ops->submit_read(ctx);
- bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
- bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
- return submit_bio_wait(&bio);
+ if (ctx->cur_folio)
+ iomap_read_end(ctx->cur_folio, cur_bytes_submitted);
}
-#else
-static int iomap_read_folio_range(const struct iomap_iter *iter,
- struct folio *folio, loff_t pos, size_t len)
-{
- WARN_ON_ONCE(1);
- return -EIO;
-}
-#endif /* CONFIG_BLOCK */
+EXPORT_SYMBOL_GPL(iomap_readahead);
/*
* iomap_is_partially_uptodate checks whether blocks within a folio are
@@ -584,7 +655,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned first, last, i;
+ unsigned first, last;
if (!ifs)
return false;
@@ -596,10 +667,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
first = from >> inode->i_blkbits;
last = (from + count - 1) >> inode->i_blkbits;
- for (i = first; i <= last; i++)
- if (!ifs_block_is_uptodate(ifs, i))
- return false;
- return true;
+ return ifs_next_nonuptodate_block(folio, first, last) > last;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
@@ -707,7 +775,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
* are not changing pagecache contents.
*/
if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
- pos + len >= folio_pos(folio) + folio_size(folio))
+ pos + len >= folio_next_pos(folio))
return 0;
ifs = ifs_alloc(iter->inode, folio, iter->flags);
@@ -723,9 +791,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
if (plen == 0)
break;
- if (!(iter->flags & IOMAP_UNSHARE) &&
- (from <= poff || from >= poff + plen) &&
- (to <= poff || to >= poff + plen))
+ /*
+ * If the read range will be entirely overwritten by the write,
+ * we can skip having to zero/read it in.
+ */
+ if (!(iter->flags & IOMAP_UNSHARE) && from <= poff &&
+ to >= poff + plen)
continue;
if (iomap_block_needs_zeroing(iter, block_start)) {
@@ -742,7 +813,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
status = write_ops->read_folio_range(iter,
folio, block_start, plen);
else
- status = iomap_read_folio_range(iter,
+ status = iomap_bio_read_folio_range_sync(iter,
folio, block_start, plen);
if (status)
return status;
@@ -761,6 +832,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
if (!mapping_large_folio_support(iter->inode->i_mapping))
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+ if (iter->fbatch) {
+ struct folio *folio = folio_batch_next(iter->fbatch);
+
+ if (!folio)
+ return NULL;
+
+ /*
+ * The folio mapping generally shouldn't have changed based on
+ * fs locks, but be consistent with filemap lookup and retry
+ * the iter if it does.
+ */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != iter->inode->i_mapping)) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ folio_unlock(folio);
+ return NULL;
+ }
+
+ folio_get(folio);
+ return folio;
+ }
+
if (write_ops && write_ops->get_folio)
return write_ops->get_folio(iter, pos, len);
return iomap_get_folio(iter, pos, len);
@@ -815,15 +908,14 @@ static int iomap_write_begin(struct iomap_iter *iter,
size_t *poffset, u64 *plen)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
+ loff_t pos;
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
struct folio *folio;
int status = 0;
len = min_not_zero(len, *plen);
- BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
- if (srcmap != &iter->iomap)
- BUG_ON(pos + len > srcmap->offset + srcmap->length);
+ *foliop = NULL;
+ *plen = 0;
if (fatal_signal_pending(current))
return -EINTR;
@@ -833,6 +925,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
return PTR_ERR(folio);
/*
+ * No folio means we're done with a batch. We still have range to
+ * process so return and let the caller iterate and refill the batch.
+ */
+ if (!folio) {
+ WARN_ON_ONCE(!iter->fbatch);
+ return 0;
+ }
+
+ /*
* Now we have a locked folio, before we do anything with it we need to
* check that the iomap we have cached is not stale. The inode extent
* mapping can change due to concurrent IO in flight (e.g.
@@ -852,6 +953,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
}
}
+ /*
+ * The folios in a batch may not be contiguous. If we've skipped
+ * forward, advance the iter to the pos of the current folio. If the
+ * folio starts beyond the end of the mapping, it may have been trimmed
+ * since the lookup for whatever reason. Return a NULL folio to
+ * terminate the op.
+ */
+ if (folio_pos(folio) > iter->pos) {
+ len = min_t(u64, folio_pos(folio) - iter->pos,
+ iomap_length(iter));
+ status = iomap_iter_advance(iter, len);
+ len = iomap_length(iter);
+ if (status || !len)
+ goto out_unlock;
+ }
+
pos = iomap_trim_folio_range(iter, folio, poffset, &len);
if (srcmap->type == IOMAP_INLINE)
@@ -1041,7 +1158,7 @@ retry:
}
} else {
total_written += written;
- iomap_iter_advance(iter, &written);
+ iomap_iter_advance(iter, written);
}
} while (iov_iter_count(i) && iomap_length(iter));
@@ -1082,7 +1199,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
struct iomap *iomap, iomap_punch_t punch)
{
- unsigned int first_blk, last_blk, i;
+ unsigned int first_blk, last_blk;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
@@ -1097,14 +1214,14 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
if (!ifs)
return;
- last_byte = min_t(loff_t, end_byte - 1,
- folio_pos(folio) + folio_size(folio) - 1);
+ last_byte = min_t(loff_t, end_byte - 1, folio_next_pos(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
- for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i))
- punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits, iomap);
+ while ((first_blk = ifs_next_clean_block(folio, first_blk, last_blk))
+ <= last_blk) {
+ punch(inode, folio_pos(folio) + (first_blk << blkbits),
+ 1 << blkbits, iomap);
+ first_blk++;
}
}
@@ -1129,8 +1246,7 @@ static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
* Make sure the next punch start is correctly bound to
* the end of this data range, not the end of the folio.
*/
- *punch_start_byte = min_t(loff_t, end_byte,
- folio_pos(folio) + folio_size(folio));
+ *punch_start_byte = min_t(loff_t, end_byte, folio_next_pos(folio));
}
/*
@@ -1170,7 +1286,7 @@ static void iomap_write_delalloc_scan(struct inode *inode,
start_byte, end_byte, iomap, punch);
/* move offset to start of next folio in range */
- start_byte = folio_pos(folio) + folio_size(folio);
+ start_byte = folio_next_pos(folio);
folio_unlock(folio);
folio_put(folio);
}
@@ -1310,7 +1426,7 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
int status;
if (!iomap_want_unshare_iter(iter))
- return iomap_iter_advance(iter, &bytes);
+ return iomap_iter_advance(iter, bytes);
do {
struct folio *folio;
@@ -1334,10 +1450,10 @@ static int iomap_unshare_iter(struct iomap_iter *iter,
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- status = iomap_iter_advance(iter, &bytes);
+ status = iomap_iter_advance(iter, bytes);
if (status)
break;
- } while (bytes > 0);
+ } while ((bytes = iomap_length(iter)) > 0);
return status;
}
@@ -1398,6 +1514,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
if (iter->iomap.flags & IOMAP_F_STALE)
break;
+ /* a NULL folio means we're done with a folio batch */
+ if (!folio) {
+ status = iomap_iter_advance_full(iter);
+ break;
+ }
+
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
@@ -1412,16 +1534,36 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
if (WARN_ON_ONCE(!ret))
return -EIO;
- status = iomap_iter_advance(iter, &bytes);
+ status = iomap_iter_advance(iter, bytes);
if (status)
break;
- } while (bytes > 0);
+ } while ((bytes = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
return status;
}
+loff_t
+iomap_fill_dirty_folios(
+ struct iomap_iter *iter,
+ loff_t offset,
+ loff_t length)
+{
+ struct address_space *mapping = iter->inode->i_mapping;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + length - 1) >> PAGE_SHIFT;
+
+ iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
+ if (!iter->fbatch)
+ return offset + length;
+ folio_batch_init(iter->fbatch);
+
+ filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
+ return (start << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
+
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops,
@@ -1435,46 +1577,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.private = private,
};
struct address_space *mapping = inode->i_mapping;
- unsigned int blocksize = i_blocksize(inode);
- unsigned int off = pos & (blocksize - 1);
- loff_t plen = min_t(loff_t, len, blocksize - off);
int ret;
bool range_dirty;
/*
- * Zero range can skip mappings that are zero on disk so long as
- * pagecache is clean. If pagecache was dirty prior to zero range, the
- * mapping converts on writeback completion and so must be zeroed.
- *
- * The simplest way to deal with this across a range is to flush
- * pagecache and process the updated mappings. To avoid excessive
- * flushing on partial eof zeroing, special case it to zero the
- * unaligned start portion if already dirty in pagecache.
- */
- if (off &&
- filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
- iter.len = plen;
- while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_zero_iter(&iter, did_zero,
- write_ops);
-
- iter.len = len - (iter.pos - pos);
- if (ret || !iter.len)
- return ret;
- }
-
- /*
* To avoid an unconditional flush, check pagecache state and only flush
* if dirty and the fs returns a mapping that might convert on
* writeback.
*/
- range_dirty = filemap_range_needs_writeback(inode->i_mapping,
- iter.pos, iter.pos + iter.len - 1);
+ range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
+ iter.pos + iter.len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
- if (srcmap->type == IOMAP_HOLE ||
- srcmap->type == IOMAP_UNWRITTEN) {
+ if (WARN_ON_ONCE(iter.fbatch &&
+ srcmap->type != IOMAP_UNWRITTEN))
+ return -EIO;
+
+ if (!iter.fbatch &&
+ (srcmap->type == IOMAP_HOLE ||
+ srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
if (range_dirty) {
@@ -1526,7 +1648,7 @@ static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
@@ -1559,16 +1681,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
+static void iomap_writeback_init(struct inode *inode, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
+ if (ifs) {
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ /*
+ * Set this to the folio size. After processing the folio for
+ * writeback in iomap_writeback_folio(), we'll subtract any
+ * ranges not written back.
+ *
+ * We do this because otherwise, we would have to atomically
+ * increment ifs->write_bytes_pending every time a range in the
+ * folio needs to be written back.
+ */
+ atomic_set(&ifs->write_bytes_pending, folio_size(folio));
+ }
}
-EXPORT_SYMBOL_GPL(iomap_start_folio_write);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len)
@@ -1585,7 +1716,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
- bool *wb_pending)
+ size_t *bytes_submitted)
{
do {
ssize_t ret;
@@ -1599,11 +1730,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
pos += ret;
/*
- * Holes are not be written back by ->writeback_range, so track
+ * Holes are not written back by ->writeback_range, so track
* if we did handle anything that is not a hole here.
*/
if (wpc->iomap.type != IOMAP_HOLE)
- *wb_pending = true;
+ *bytes_submitted += ret;
} while (rlen);
return 0;
@@ -1674,7 +1805,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
u64 end_aligned = 0;
- bool wb_pending = false;
+ size_t bytes_submitted = 0;
int error = 0;
u32 rlen;
@@ -1694,14 +1825,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
iomap_set_range_dirty(folio, 0, end_pos - pos);
}
- /*
- * Keep the I/O completion handler from clearing the writeback
- * bit until we have submitted all blocks by adding a bias to
- * ifs->write_bytes_pending, which is dropped after submitting
- * all blocks.
- */
- WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
- iomap_start_folio_write(inode, folio, 1);
+ iomap_writeback_init(inode, folio);
}
/*
@@ -1716,13 +1840,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
- &wb_pending);
+ &bytes_submitted);
if (error)
break;
pos += rlen;
}
- if (wb_pending)
+ if (bytes_submitted)
wpc->nr_folios++;
/*
@@ -1740,12 +1864,20 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
* bit ourselves right after unlocking the page.
*/
if (ifs) {
- if (atomic_dec_and_test(&ifs->write_bytes_pending))
- folio_end_writeback(folio);
- } else {
- if (!wb_pending)
- folio_end_writeback(folio);
+ /*
+ * Subtract any bytes that were initially accounted to
+ * write_bytes_pending but skipped for writeback.
+ */
+ size_t bytes_not_submitted = folio_size(folio) -
+ bytes_submitted;
+
+ if (bytes_not_submitted)
+ iomap_finish_folio_write(inode, folio,
+ bytes_not_submitted);
+ } else if (!bytes_submitted) {
+ folio_end_writeback(folio);
}
+
mapping_set_error(inode->i_mapping, error);
return error;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d5d63efbd57..8e273408453a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -16,21 +16,13 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
-#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
-#define IOMAP_DIO_CALLER_COMP (1U << 26)
-#define IOMAP_DIO_INLINE_COMP (1U << 27)
+#define IOMAP_DIO_NO_INVALIDATE (1U << 26)
+#define IOMAP_DIO_COMP_WORK (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
#define IOMAP_DIO_NEED_SYNC (1U << 29)
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
-/*
- * Used for sub block zeroing in iomap_dio_zero()
- */
-#define IOMAP_ZERO_PAGE_SIZE (SZ_64K)
-#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE))
-static struct page *zero_page;
-
struct iomap_dio {
struct kiocb *iocb;
const struct iomap_dio_ops *dops;
@@ -140,11 +132,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
-static ssize_t iomap_dio_deferred_complete(void *data)
-{
- return iomap_dio_complete(data);
-}
-
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -179,33 +166,33 @@ static void iomap_dio_done(struct iomap_dio *dio)
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
- WRITE_ONCE(iocb->private, NULL);
- iomap_dio_complete_work(&dio->aio.work);
- } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
- * schedule our completion that way to avoid an async punt to a
- * workqueue.
- */
- /* only polled IO cares about private cleared */
- iocb->private = dio;
- iocb->dio_complete = iomap_dio_deferred_complete;
+ return;
+ }
- /*
- * Invoke ->ki_complete() directly. We've assigned our
- * dio_complete callback handler, and since the issuer set
- * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
- * notice ->dio_complete being set and will defer calling that
- * handler until it can be done from a safe task context.
- *
- * Note that the 'res' being passed in here is not important
- * for this case. The actual completion value of the request
- * will be gotten from dio_complete when that is run by the
- * issuer.
- */
- iocb->ki_complete(iocb, 0);
- } else {
+ /*
+ * Always run error completions in user context. These are not
+ * performance critical and some code relies on taking sleeping locks
+ * for error handling.
+ */
+ if (dio->error)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ /*
+ * Never invalidate pages from this context to avoid deadlocks with
+ * buffered I/O completions when called from the ioend workqueue,
+ * or to avoid sleeping when called directly from ->bi_end_io.
+ * Tough luck if you hit the tiny race with someone dirtying the range
+ * right between this check and the actual completion.
+ */
+ if ((dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_COMP_WORK)) {
+ if (dio->iocb->ki_filp->f_mapping->nrpages)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+ else
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+
+ if (dio->flags & IOMAP_DIO_COMP_WORK) {
struct inode *inode = file_inode(iocb->ki_filp);
/*
@@ -216,7 +203,11 @@ static void iomap_dio_done(struct iomap_dio *dio)
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
+ return;
}
+
+ WRITE_ONCE(iocb->private, NULL);
+ iomap_dio_complete_work(&dio->aio.work);
}
void iomap_dio_bio_end_io(struct bio *bio)
@@ -252,16 +243,9 @@ u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
/*
* Try to avoid another context switch for the completion given
* that we are already called from the ioend completion
- * workqueue, but never invalidate pages from this thread to
- * avoid deadlocks with buffered I/O completions. Tough luck if
- * you hit the tiny race with someone dirtying the range now
- * between this check and the actual completion.
+ * workqueue.
*/
- if (!dio->iocb->ki_filp->f_mapping->nrpages) {
- dio->flags |= IOMAP_DIO_INLINE_COMP;
- dio->flags |= IOMAP_DIO_NO_INVALIDATE;
- }
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ dio->flags &= ~IOMAP_DIO_COMP_WORK;
iomap_dio_done(dio);
}
@@ -285,42 +269,36 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct bio *bio;
+ struct folio *zero_folio = largest_zero_folio();
+ int nr_vecs = max(1, i_blocksize(inode) / folio_size(zero_folio));
if (!len)
return 0;
+
/*
- * Max block size supported is 64k
+ * This limit shall never be reached as most filesystems have a
+ * maximum blocksize of 64k.
*/
- if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE))
+ if (WARN_ON_ONCE(nr_vecs > BIO_MAX_VECS))
return -EINVAL;
- bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
+ bio = iomap_dio_alloc_bio(iter, dio, nr_vecs,
+ REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- __bio_add_page(bio, zero_page, len, 0);
+ while (len > 0) {
+ unsigned int io_len = min(len, folio_size(zero_folio));
+
+ bio_add_folio_nofail(bio, zero_folio, io_len, 0);
+ len -= io_len;
+ }
iomap_dio_submit_bio(iter, dio, bio, pos);
- return 0;
-}
-/*
- * Use a FUA write if we need datasync semantics and this is a pure data I/O
- * that doesn't require any metadata updates (including after I/O completion
- * such as unwritten extent conversion) and the underlying device either
- * doesn't have a volatile write cache or supports FUA.
- * This allows us to avoid cache flushes on I/O completion.
- */
-static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
- struct iomap_dio *dio)
-{
- if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
- return false;
- if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
- return false;
- return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
+ return 0;
}
static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -336,12 +314,39 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
int nr_pages, ret = 0;
u64 copied = 0;
size_t orig_count;
+ unsigned int alignment;
+
+ /*
+ * File systems that write out of place and always allocate new blocks
+ * need each bio to be block aligned as that's the unit of allocation.
+ */
+ if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+ alignment = fs_block_size;
+ else
+ alignment = bdev_logical_block_size(iomap->bdev);
- if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1))
+ if ((pos | length) & (alignment - 1))
return -EINVAL;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_opf |= REQ_OP_WRITE;
+ bool need_completion_work = true;
+
+ switch (iomap->type) {
+ case IOMAP_MAPPED:
+ /*
+ * Directly mapped I/O does not inherently need to do
+ * work at I/O completion time. But there are various
+ * cases below where this will get set again.
+ */
+ need_completion_work = false;
+ break;
+ case IOMAP_UNWRITTEN:
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ break;
+ default:
+ break;
+ }
if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
/*
@@ -354,35 +359,54 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio_opf |= REQ_ATOMIC;
}
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
-
- if (iomap->flags & IOMAP_F_SHARED)
+ if (iomap->flags & IOMAP_F_SHARED) {
+ /*
+ * Unsharing needs to update metadata at I/O
+ * completion time.
+ */
+ need_completion_work = true;
dio->flags |= IOMAP_DIO_COW;
+ }
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
+ /*
+ * Newly allocated blocks might need recording in
+ * metadata at I/O completion time.
+ */
+ need_completion_work = true;
need_zeroout = true;
- else if (iomap->type == IOMAP_MAPPED &&
- iomap_dio_can_use_fua(iomap, dio))
- bio_opf |= REQ_FUA;
+ }
- if (!(bio_opf & REQ_FUA))
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ /*
+ * Use a FUA write if we need datasync semantics and this is a
+ * pure overwrite that doesn't require any metadata updates.
+ *
+ * This allows us to avoid cache flushes on I/O completion.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_THROUGH) {
+ if (!need_completion_work &&
+ !(iomap->flags & IOMAP_F_DIRTY) &&
+ (!bdev_write_cache(iomap->bdev) ||
+ bdev_fua(iomap->bdev)))
+ bio_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ }
/*
- * We can only do deferred completion for pure overwrites that
+ * We can only do inline completion for pure overwrites that
* don't require additional I/O at completion time.
*
- * This rules out writes that need zeroing or extent conversion,
- * extend the file size, or issue metadata I/O or cache flushes
- * during completion processing.
+ * This rules out writes that need zeroing or metadata updates to
+ * convert unwritten or shared extents.
+ *
+ * Writes that extend i_size are also not supported, but this is
+ * handled in __iomap_dio_rw().
*/
- if (need_zeroout || (pos >= i_size_read(inode)) ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
- !(bio_opf & REQ_FUA)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ if (need_completion_work)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ bio_opf |= REQ_OP_WRITE;
} else {
bio_opf |= REQ_OP_READ;
}
@@ -403,7 +427,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
*/
- if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
+ if (dio->flags & IOMAP_DIO_COMP_WORK)
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@@ -434,7 +458,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio->bi_end_io = iomap_dio_bio_end_io;
ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
- bdev_logical_block_size(iomap->bdev) - 1);
+ alignment - 1);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
@@ -496,7 +520,7 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return iomap_iter_advance(iter, &copied);
+ return iomap_iter_advance(iter, copied);
return ret;
}
@@ -507,7 +531,7 @@ static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
dio->size += length;
if (!length)
return -EFAULT;
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
@@ -542,7 +566,7 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
dio->size += copied;
if (!copied)
return -EFAULT;
- return iomap_iter_advance(iomi, &copied);
+ return iomap_iter_advance(iomi, copied);
}
static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -639,10 +663,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
- if (iov_iter_rw(iter) == READ) {
- /* reads can always complete inline */
- dio->flags |= IOMAP_DIO_INLINE_COMP;
+ if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+ dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+ if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@@ -656,15 +680,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
- /*
- * Flag as supporting deferred completions, if the issuer
- * groks it. This can avoid a workqueue punt for writes.
- * We may later clear this flag if we need to do other IO
- * as part of this IO completion.
- */
- if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
- dio->flags |= IOMAP_DIO_CALLER_COMP;
-
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -694,6 +709,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
/*
+ * i_size updates must happen from process context.
+ */
+ if (iomi.pos + iomi.len > dio->i_size)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
+
+ /*
* Try to invalidate cache pages for the range we are writing.
* If this invalidation fails, let the caller fall back to
* buffered I/O.
@@ -717,12 +738,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
goto out_free_dio;
}
+ }
- if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
- ret = sb_init_dio_done_wq(inode->i_sb);
- if (ret < 0)
- goto out_free_dio;
- }
+ if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
+ ret = sb_init_dio_done_wq(inode->i_sb);
+ if (ret < 0)
+ goto out_free_dio;
}
inode_dio_begin(inode);
@@ -765,9 +786,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* If all the writes we issued were already written through to the
* media, we don't need to flush the cache on IO completion. Clear the
* sync flag for this case.
+ *
+ * Otherwise clear the inline completion flag if any sync work is
+ * needed, as that needs to be performed from process context.
*/
if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags |= IOMAP_DIO_COMP_WORK;
/*
* We are about to drop our additional submission reference, which
@@ -825,15 +851,3 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
-
-static int __init iomap_dio_init(void)
-{
- zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
- IOMAP_ZERO_PAGE_ORDER);
-
- if (!zero_page)
- return -ENOMEM;
-
- return 0;
-}
-fs_initcall(iomap_dio_init);
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index d05cb3aed96e..3a4e4aad2bd1 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -6,4 +6,16 @@
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+#ifdef CONFIG_BLOCK
+int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len);
+#else
+static inline int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+#endif /* CONFIG_BLOCK */
+
#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..86f44922ed3b 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -194,8 +194,6 @@ new_ioend:
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
- iomap_start_folio_write(wpc->inode, folio, map_len);
-
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index cef77ca0c20b..8692e5e41c6d 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -8,22 +8,24 @@
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
+ if (iter->fbatch) {
+ folio_batch_release(iter->fbatch);
+ kfree(iter->fbatch);
+ iter->fbatch = NULL;
+ }
+
iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
}
-/*
- * Advance the current iterator position and output the length remaining for the
- * current mapping.
- */
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+/* Advance the current iterator position and decrement the remaining length */
+int iomap_iter_advance(struct iomap_iter *iter, u64 count)
{
- if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ if (WARN_ON_ONCE(count > iomap_length(iter)))
return -EIO;
- iter->pos += *count;
- iter->len -= *count;
- *count = iomap_length(iter);
+ iter->pos += count;
+ iter->len -= count;
return 0;
}
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index 56db2dd4b10d..6cbc587c93da 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -16,13 +16,13 @@ static int iomap_seek_hole_iter(struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
}
}
@@ -59,12 +59,12 @@ static int iomap_seek_data_iter(struct iomap_iter *iter,
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
return 0;
default:
*hole_pos = iter->pos;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index a61c1dae4742..532787277b16 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -122,9 +122,10 @@ DEFINE_RANGE_EVENT(iomap_zero_iter);
#define IOMAP_DIO_STRINGS \
- {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
- {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
- {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }
+ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
+ {IOMAP_DIO_OVERWRITE_ONLY, "DIO_OVERWRITE_ONLY" }, \
+ {IOMAP_DIO_PARTIAL, "DIO_PARTIAL" }, \
+ {IOMAP_DIO_FSBLOCK_ALIGNED, "DIO_FSBLOCK_ALIGNED" }
DECLARE_EVENT_CLASS(iomap_class,
TP_PROTO(struct inode *inode, struct iomap *iomap),
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6f0e6b19383c..b7cbe126faf3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -610,6 +610,11 @@ static int isofs_fill_super(struct super_block *s, struct fs_context *fc)
goto out_freesbi;
}
opt->blocksize = sb_min_blocksize(s, opt->blocksize);
+ if (!opt->blocksize) {
+ printk(KERN_ERR
+ "ISOFS: unable to set blocksize\n");
+ goto out_freesbi;
+ }
sbi->s_high_sierra = 0; /* default is iso9660 */
sbi->s_session = opt->session;
@@ -1515,7 +1520,7 @@ struct inode *__isofs_iget(struct super_block *sb,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
ret = isofs_read_inode(inode, relocated);
if (ret < 0) {
iget_failed(inode);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index d175cccb7c55..764bba8ba999 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -265,7 +265,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
f = JFFS2_INODE_INFO(inode);
@@ -373,7 +373,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
{
struct iattr iattr;
- if (!(inode->i_state & I_DIRTY_DATASYNC)) {
+ if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) {
jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n",
__func__, inode->i_ino);
return;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 2a4a288b821c..87ad042221e7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -26,8 +26,8 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return rc;
inode_lock(inode);
- if (!(inode->i_state & I_DIRTY_ALL) ||
- (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL) ||
+ (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
inode_unlock(inode);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 21f3d029da7d..4709762713ef 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -29,7 +29,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ret = diRead(inode);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 10934f9a11be..5aaafedb8fbc 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -76,14 +76,14 @@ struct jfs_inode_info {
struct {
unchar _unused[16]; /* 16: */
dxd_t _dxd; /* 16: */
- /* _inline may overflow into _inline_ea when needed */
+ /* _inline_sym may overflow into _inline_ea when needed */
/* _inline_ea may overlay the last part of
* file._xtroot if maxentry = XTROOTINITSLOT
*/
union {
struct {
/* 128: inline symlink */
- unchar _inline[128];
+ unchar _inline_sym[128];
/* 128: inline extended attr */
unchar _inline_ea[128];
};
@@ -101,7 +101,7 @@ struct jfs_inode_info {
#define i_imap u.file._imap
#define i_dirtable u.dir._table
#define i_dtroot u.dir._dtroot
-#define i_inline u.link._inline
+#define i_inline u.link._inline_sym
#define i_inline_ea u.link._inline_ea
#define i_inline_all u.link._inline_all
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 7840a03e5bcb..c16578af3a77 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1287,7 +1287,7 @@ int txCommit(tid_t tid, /* transaction identifier */
* to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
* Joern
*/
- if (tblk->u.ip->i_state & I_SYNC)
+ if (inode_state_read_once(tblk->u.ip) & I_SYNC)
tblk->xflag &= ~COMMIT_LAZY;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 457f91c412d4..a36aaee98dce 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -251,7 +251,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
struct inode *inode;
inode = iget_locked(sb, kernfs_ino(kn));
- if (inode && (inode->i_state & I_NEW))
+ if (inode && (inode_state_read_once(inode) & I_NEW))
kernfs_init_inode(kn, inode);
return inode;
diff --git a/fs/libfs.c b/fs/libfs.c
index ce8c496a6940..2d6657947abd 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -680,6 +680,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr;
s->s_time_gran = 1;
+ s->s_d_flags |= ctx->s_d_flags;
root = new_inode(s);
if (!root)
return -ENOMEM;
@@ -1542,9 +1543,9 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
inode_lock(inode);
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY_ALL))
+ if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
goto out;
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
goto out;
err = sync_inode_metadata(inode, 1);
@@ -1664,7 +1665,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
* list because mark_inode_dirty() will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode_state_assign_raw(inode, I_DIRTY);
/*
* Historically anonymous inodes don't have a type at all and
* userspace has come to rely on this.
@@ -2289,27 +2290,25 @@ void stashed_dentry_prune(struct dentry *dentry)
cmpxchg(stashed, dentry, NULL);
}
-/* parent must be held exclusive */
+/**
+ * simple_start_creating - prepare to create a given name
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Required lock is taken and a lookup is performed prior to creating an
+ * object in a directory. No permission checking is performed.
+ *
+ * Returns: a negative dentry on which vfs_create() or similar may
+ * be attempted, or an error.
+ */
struct dentry *simple_start_creating(struct dentry *parent, const char *name)
{
- struct dentry *dentry;
- struct inode *dir = d_inode(parent);
+ struct qstr qname = QSTR(name);
+ int err;
- inode_lock(dir);
- if (unlikely(IS_DEADDIR(dir))) {
- inode_unlock(dir);
- return ERR_PTR(-ENOENT);
- }
- dentry = lookup_noperm(&QSTR(name), parent);
- if (IS_ERR(dentry)) {
- inode_unlock(dir);
- return dentry;
- }
- if (dentry->d_inode) {
- dput(dentry);
- inode_unlock(dir);
- return ERR_PTR(-EEXIST);
- }
- return dentry;
+ err = lookup_noperm_common(&qname, parent);
+ if (err)
+ return ERR_PTR(err);
+ return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL);
}
EXPORT_SYMBOL(simple_start_creating);
diff --git a/fs/locks.c b/fs/locks.c
index 04a3f0e20724..9f565802a88c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -585,7 +585,7 @@ static const struct lease_manager_operations lease_manager_ops = {
/*
* Initialize a lease, use the default lock manager operations
*/
-static int lease_init(struct file *filp, int type, struct file_lease *fl)
+static int lease_init(struct file *filp, unsigned int flags, int type, struct file_lease *fl)
{
if (assign_type(&fl->c, type) != 0)
return -EINVAL;
@@ -594,13 +594,13 @@ static int lease_init(struct file *filp, int type, struct file_lease *fl)
fl->c.flc_pid = current->tgid;
fl->c.flc_file = filp;
- fl->c.flc_flags = FL_LEASE;
+ fl->c.flc_flags = flags;
fl->fl_lmops = &lease_manager_ops;
return 0;
}
/* Allocate a file_lock initialised to this type of lease */
-static struct file_lease *lease_alloc(struct file *filp, int type)
+static struct file_lease *lease_alloc(struct file *filp, unsigned int flags, int type)
{
struct file_lease *fl = locks_alloc_lease();
int error = -ENOMEM;
@@ -608,7 +608,7 @@ static struct file_lease *lease_alloc(struct file *filp, int type)
if (fl == NULL)
return ERR_PTR(error);
- error = lease_init(filp, type, fl);
+ error = lease_init(filp, flags, type, fl);
if (error) {
locks_free_lease(fl);
return ERR_PTR(error);
@@ -1529,29 +1529,35 @@ any_leases_conflict(struct inode *inode, struct file_lease *breaker)
/**
* __break_lease - revoke all outstanding leases on file
* @inode: the inode of the file to return
- * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
- * break all leases
- * @type: FL_LEASE: break leases and delegations; FL_DELEG: break
- * only delegations
+ * @flags: LEASE_BREAK_* flags
*
* break_lease (inlined for speed) has checked there already is at least
* some kind of lock (maybe a lease) on this file. Leases are broken on
- * a call to open() or truncate(). This function can sleep unless you
- * specified %O_NONBLOCK to your open().
+ * a call to open() or truncate(). This function can block waiting for the
+ * lease break unless you specify LEASE_BREAK_NONBLOCK.
*/
-int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+int __break_lease(struct inode *inode, unsigned int flags)
{
- int error = 0;
- struct file_lock_context *ctx;
struct file_lease *new_fl, *fl, *tmp;
+ struct file_lock_context *ctx;
unsigned long break_time;
- int want_write = (mode & O_ACCMODE) != O_RDONLY;
+ unsigned int type;
LIST_HEAD(dispose);
+ bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY);
+ int error = 0;
- new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
+ if (flags & LEASE_BREAK_LEASE)
+ type = FL_LEASE;
+ else if (flags & LEASE_BREAK_DELEG)
+ type = FL_DELEG;
+ else if (flags & LEASE_BREAK_LAYOUT)
+ type = FL_LAYOUT;
+ else
+ return -EINVAL;
+
+ new_fl = lease_alloc(NULL, type, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
- new_fl->c.flc_flags = type;
/* typically we will check that ctx is non-NULL before calling */
ctx = locks_inode_context(inode);
@@ -1596,7 +1602,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
if (list_empty(&ctx->flc_lease))
goto out;
- if (mode & O_NONBLOCK) {
+ if (flags & LEASE_BREAK_NONBLOCK) {
trace_break_lease_noblock(inode, new_fl);
error = -EWOULDBLOCK;
goto out;
@@ -1675,8 +1681,9 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
EXPORT_SYMBOL(lease_get_mtime);
/**
- * fcntl_getlease - Enquire what lease is currently active
+ * __fcntl_getlease - Enquire what lease is currently active
* @filp: the file
+ * @flavor: type of lease flags to check
*
* The value returned by this function will be one of
* (if no lease break is pending):
@@ -1697,7 +1704,7 @@ EXPORT_SYMBOL(lease_get_mtime);
* XXX: sfr & willy disagree over whether F_INPROGRESS
* should be returned to userspace.
*/
-int fcntl_getlease(struct file *filp)
+static int __fcntl_getlease(struct file *filp, unsigned int flavor)
{
struct file_lease *fl;
struct inode *inode = file_inode(filp);
@@ -1713,7 +1720,8 @@ int fcntl_getlease(struct file *filp)
list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
if (fl->c.flc_file != filp)
continue;
- type = target_leasetype(fl);
+ if (fl->c.flc_flags & flavor)
+ type = target_leasetype(fl);
break;
}
spin_unlock(&ctx->flc_lock);
@@ -1724,6 +1732,19 @@ int fcntl_getlease(struct file *filp)
return type;
}
+int fcntl_getlease(struct file *filp)
+{
+ return __fcntl_getlease(filp, FL_LEASE);
+}
+
+int fcntl_getdeleg(struct file *filp, struct delegation *deleg)
+{
+ if (deleg->d_flags != 0 || deleg->__pad != 0)
+ return -EINVAL;
+ deleg->d_type = __fcntl_getlease(filp, FL_DELEG);
+ return 0;
+}
+
/**
* check_conflicting_open - see if the given file points to an inode that has
* an existing open that would conflict with the
@@ -1929,11 +1950,19 @@ static int generic_delete_lease(struct file *filp, void *owner)
int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
void **priv)
{
+ struct inode *inode = file_inode(filp);
+
+ if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
switch (arg) {
case F_UNLCK:
return generic_delete_lease(filp, *priv);
- case F_RDLCK:
case F_WRLCK:
+ if (S_ISDIR(inode->i_mode))
+ return -EINVAL;
+ fallthrough;
+ case F_RDLCK:
if (!(*flp)->fl_lmops->lm_break) {
WARN_ON_ONCE(1);
return -ENOLCK;
@@ -2018,8 +2047,6 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
return -EACCES;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
error = security_file_lock(filp, arg);
if (error)
return error;
@@ -2027,13 +2054,13 @@ vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
}
EXPORT_SYMBOL_GPL(vfs_setlease);
-static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, unsigned int flavor, int arg)
{
struct file_lease *fl;
struct fasync_struct *new;
int error;
- fl = lease_alloc(filp, arg);
+ fl = lease_alloc(filp, flavor, arg);
if (IS_ERR(fl))
return PTR_ERR(fl);
@@ -2064,9 +2091,33 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
*/
int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
+ if (S_ISDIR(file_inode(filp)->i_mode))
+ return -EINVAL;
+
if (arg == F_UNLCK)
return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
- return do_fcntl_add_lease(fd, filp, arg);
+ return do_fcntl_add_lease(fd, filp, FL_LEASE, arg);
+}
+
+/**
+ * fcntl_setdeleg - sets a delegation on an open file
+ * @fd: open file descriptor
+ * @filp: file pointer
+ * @deleg: delegation request from userland
+ *
+ * Call this fcntl to establish a delegation on the file.
+ * Note that you also need to call %F_SETSIG to
+ * receive a signal when the lease is broken.
+ */
+int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg)
+{
+ /* For now, no flags are supported */
+ if (deleg->d_flags != 0 || deleg->__pad != 0)
+ return -EINVAL;
+
+ if (deleg->d_type == F_UNLCK)
+ return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
+ return do_fcntl_add_lease(fd, filp, FL_DELEG, deleg->d_type);
}
/**
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 32db676127a9..51ea9bdc813f 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,6 +26,22 @@ static int minix_write_inode(struct inode *inode,
struct writeback_control *wbc);
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
+void __minix_error_inode(struct inode *inode, const char *function,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "minix-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+}
+
static void minix_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
@@ -589,7 +605,7 @@ struct inode *minix_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
if (INODE_VERSION(inode) == MINIX_V1)
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index d54273c3c9ff..2bfaf377f208 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -42,6 +42,9 @@ struct minix_sb_info {
unsigned short s_version;
};
+void __minix_error_inode(struct inode *inode, const char *function,
+ unsigned int line, const char *fmt, ...);
+
struct inode *minix_iget(struct super_block *, unsigned long);
struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
@@ -168,4 +171,10 @@ static inline int minix_test_bit(int nr, const void *vaddr)
#endif
+#define minix_error_inode(inode, fmt, ...) \
+ __minix_error_inode((inode), __func__, __LINE__, \
+ (fmt), ##__VA_ARGS__)
+
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
#endif /* FS_MINIX_H */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 8938536d8d3c..263e4ba8b1c8 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -145,6 +145,11 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
struct minix_dir_entry * de;
int err;
+ if (inode->i_nlink == 0) {
+ minix_error_inode(inode, "inode has corrupted nlink");
+ return -EFSCORRUPTED;
+ }
+
de = minix_find_entry(dentry, &folio);
if (!de)
return -ENOENT;
@@ -161,15 +166,24 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
static int minix_rmdir(struct inode * dir, struct dentry *dentry)
{
struct inode * inode = d_inode(dentry);
- int err = -ENOTEMPTY;
+ int err = -EFSCORRUPTED;
- if (minix_empty_dir(inode)) {
- err = minix_unlink(dir, dentry);
- if (!err) {
- inode_dec_link_count(dir);
- inode_dec_link_count(inode);
- }
+ if (dir->i_nlink <= 2) {
+ minix_error_inode(dir, "inode has corrupted nlink");
+ goto out;
+ }
+
+ err = -ENOTEMPTY;
+ if (!minix_empty_dir(inode))
+ goto out;
+
+ err = minix_unlink(dir, dentry);
+ if (!err) {
+ inode_dec_link_count(dir);
+ inode_dec_link_count(inode);
}
+
+out:
return err;
}
@@ -208,6 +222,17 @@ static int minix_rename(struct mnt_idmap *idmap,
if (dir_de && !minix_empty_dir(new_inode))
goto out_dir;
+ err = -EFSCORRUPTED;
+ if (new_inode->i_nlink == 0 || (dir_de && new_inode->i_nlink != 2)) {
+ minix_error_inode(new_inode, "inode has corrupted nlink");
+ goto out_dir;
+ }
+
+ if (dir_de && old_dir->i_nlink <= 2) {
+ minix_error_inode(old_dir, "inode has corrupted nlink");
+ goto out_dir;
+ }
+
err = -ENOENT;
new_de = minix_find_entry(new_dentry, &new_folio);
if (!new_de)
diff --git a/fs/mount.h b/fs/mount.h
index f13a28752d0b..2d28ef2a3aed 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -27,6 +27,7 @@ struct mnt_namespace {
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
refcount_t passive; /* number references not pinning @mounts */
+ bool is_anon;
} __randomize_layout;
struct mnt_pcp {
@@ -175,7 +176,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry)
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
- return ns->ns.ns_id == 0;
+ return ns->is_anon;
}
static inline bool anon_ns_root(const struct mount *m)
diff --git a/fs/namei.c b/fs/namei.c
index 7377020a2cba..bf0f66f0e9b9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -282,7 +282,7 @@ void putname(struct filename *name)
return;
refcnt = atomic_read(&name->refcnt);
- if (refcnt != 1) {
+ if (unlikely(refcnt != 1)) {
if (WARN_ON_ONCE(!refcnt))
return;
@@ -290,7 +290,7 @@ void putname(struct filename *name)
return;
}
- if (name->name != name->iname) {
+ if (unlikely(name->name != name->iname)) {
__putname(name->name);
kfree(name);
} else
@@ -540,10 +540,13 @@ static inline int do_inode_permission(struct mnt_idmap *idmap,
* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
*
* Separate out file-system wide checks from inode-specific permission checks.
+ *
+ * Note: lookup_inode_permission_may_exec() does not call here. If you add
+ * MAY_EXEC checks, adjust it.
*/
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
- if (unlikely(mask & MAY_WRITE)) {
+ if (mask & MAY_WRITE) {
umode_t mode = inode->i_mode;
/* Nobody gets write access to a read-only fs. */
@@ -574,7 +577,7 @@ int inode_permission(struct mnt_idmap *idmap,
if (unlikely(retval))
return retval;
- if (unlikely(mask & MAY_WRITE)) {
+ if (mask & MAY_WRITE) {
/*
* Nobody gets write access to an immutable file.
*/
@@ -602,6 +605,42 @@ int inode_permission(struct mnt_idmap *idmap,
}
EXPORT_SYMBOL(inode_permission);
+/*
+ * lookup_inode_permission_may_exec - Check traversal right for given inode
+ *
+ * This is a special case routine for may_lookup() making assumptions specific
+ * to path traversal. Use inode_permission() if you are doing something else.
+ *
+ * Work is shaved off compared to inode_permission() as follows:
+ * - we know for a fact there is no MAY_WRITE to worry about
+ * - it is an invariant the inode is a directory
+ *
+ * Since the majority of real-world traversal happens on inodes which grant it
+ * for everyone, we check it upfront and only resort to more expensive work if
+ * it fails.
+ *
+ * Filesystems which have their own ->permission hook and consequently miss out
+ * on IOP_FASTPERM can still get the optimization if they set IOP_FASTPERM_MAY_EXEC
+ * on their directory inodes.
+ *
+ * @mask may contain nothing but MAY_NOT_BLOCK (anything else trips the
+ * VFS_BUG_ON below); MAY_EXEC is OR-ed in here, so callers must not pass it.
+ *
+ * The fast path is taken only when the inode has IOP_FASTPERM or
+ * IOP_FASTPERM_MAY_EXEC set, all three execute bits (0111) are set in i_mode
+ * and no_acl_inode() is true. In that case only security_inode_permission()
+ * is consulted; in every other case the full inode_permission() is called
+ * with the same mask.
+ */
+static __always_inline int lookup_inode_permission_may_exec(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ /* Lookup already checked this to return -ENOTDIR */
+ VFS_BUG_ON_INODE(!S_ISDIR(inode->i_mode), inode);
+ VFS_BUG_ON((mask & ~MAY_NOT_BLOCK) != 0);
+
+ mask |= MAY_EXEC;
+
+ if (unlikely(!(inode->i_opflags & (IOP_FASTPERM | IOP_FASTPERM_MAY_EXEC))))
+ return inode_permission(idmap, inode, mask);
+
+ if (unlikely(((inode->i_mode & 0111) != 0111) || !no_acl_inode(inode)))
+ return inode_permission(idmap, inode, mask);
+
+ return security_inode_permission(inode, mask);
+}
+
/**
* path_get - get a reference to a path
* @path: path to get the reference to
@@ -746,7 +785,8 @@ static void leave_rcu(struct nameidata *nd)
static void terminate_walk(struct nameidata *nd)
{
- drop_links(nd);
+ if (unlikely(nd->depth))
+ drop_links(nd);
if (!(nd->flags & LOOKUP_RCU)) {
int i;
path_put(&nd->path);
@@ -843,7 +883,7 @@ static bool try_to_unlazy(struct nameidata *nd)
BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (unlikely(!legitimize_links(nd)))
+ if (unlikely(nd->depth && !legitimize_links(nd)))
goto out1;
if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
goto out;
@@ -878,7 +918,7 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
int res;
BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (unlikely(!legitimize_links(nd)))
+ if (unlikely(nd->depth && !legitimize_links(nd)))
goto out2;
res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
if (unlikely(res)) {
@@ -951,8 +991,8 @@ static int complete_walk(struct nameidata *nd)
* We don't want to zero nd->root for scoped-lookups or
* externally-managed nd->root.
*/
- if (!(nd->state & ND_ROOT_PRESET))
- if (!(nd->flags & LOOKUP_IS_SCOPED))
+ if (likely(!(nd->state & ND_ROOT_PRESET)))
+ if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED;
if (!try_to_unlazy(nd))
@@ -1034,7 +1074,7 @@ static int nd_jump_root(struct nameidata *nd)
}
if (!nd->root.mnt) {
int error = set_root(nd);
- if (error)
+ if (unlikely(error))
return error;
}
if (nd->flags & LOOKUP_RCU) {
@@ -1632,13 +1672,15 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
path->dentry = dentry;
if (nd->flags & LOOKUP_RCU) {
unsigned int seq = nd->next_seq;
+ if (likely(!d_managed(dentry)))
+ return 0;
if (likely(__follow_mount_rcu(nd, path)))
return 0;
// *path and nd->next_seq might've been clobbered
path->mnt = nd->path.mnt;
path->dentry = dentry;
nd->next_seq = seq;
- if (!try_to_unlazy_next(nd, dentry))
+ if (unlikely(!try_to_unlazy_next(nd, dentry)))
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
@@ -1823,7 +1865,7 @@ again:
return dentry;
}
-static struct dentry *lookup_slow(const struct qstr *name,
+static noinline struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
@@ -1855,7 +1897,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
int err, mask;
mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
- err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+ err = lookup_inode_permission_may_exec(idmap, nd->inode, mask);
if (likely(!err))
return 0;
@@ -1870,7 +1912,7 @@ static inline int may_lookup(struct mnt_idmap *idmap,
if (err != -ECHILD) // hard error
return err;
- return inode_permission(idmap, nd->inode, MAY_EXEC);
+ return lookup_inode_permission_may_exec(idmap, nd->inode, 0);
}
static int reserve_stack(struct nameidata *nd, struct path *link)
@@ -1901,13 +1943,23 @@ static int reserve_stack(struct nameidata *nd, struct path *link)
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
-static const char *pick_link(struct nameidata *nd, struct path *link,
+static noinline const char *pick_link(struct nameidata *nd, struct path *link,
struct inode *inode, int flags)
{
struct saved *last;
const char *res;
- int error = reserve_stack(nd, link);
+ int error;
+ if (nd->flags & LOOKUP_RCU) {
+ /* make sure that d_is_symlink from step_into_slowpath() matches the inode */
+ if (read_seqcount_retry(&link->dentry->d_seq, nd->next_seq))
+ return ERR_PTR(-ECHILD);
+ } else {
+ if (link->mnt == nd->path.mnt)
+ mntget(link->mnt);
+ }
+
+ error = reserve_stack(nd, link);
if (unlikely(error)) {
if (!(nd->flags & LOOKUP_RCU))
path_put(link);
@@ -1981,14 +2033,15 @@ all_done: // pure jump
*
* NOTE: dentry must be what nd->next_seq had been sampled from.
*/
-static const char *step_into(struct nameidata *nd, int flags,
+static noinline const char *step_into_slowpath(struct nameidata *nd, int flags,
struct dentry *dentry)
{
struct path path;
struct inode *inode;
- int err = handle_mounts(nd, dentry, &path);
+ int err;
- if (err < 0)
+ err = handle_mounts(nd, dentry, &path);
+ if (unlikely(err < 0))
return ERR_PTR(err);
inode = path.dentry->d_inode;
if (likely(!d_is_symlink(path.dentry)) ||
@@ -2010,15 +2063,32 @@ static const char *step_into(struct nameidata *nd, int flags,
nd->seq = nd->next_seq;
return NULL;
}
- if (nd->flags & LOOKUP_RCU) {
- /* make sure that d_is_symlink above matches inode */
- if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
+ return pick_link(nd, &path, inode, flags);
+}
+
+static __always_inline const char *step_into(struct nameidata *nd, int flags,
+ struct dentry *dentry)
+{
+ /*
+ * In the common case we are in rcu-walk and traversing over a non-mounted on
+ * directory (as opposed to e.g., a symlink).
+ *
+ * We can handle that and negative entries with the checks below.
+ */
+ if (likely((nd->flags & LOOKUP_RCU) &&
+ !d_managed(dentry) && !d_is_symlink(dentry))) {
+ struct inode *inode = dentry->d_inode;
+ if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
return ERR_PTR(-ECHILD);
- } else {
- if (path.mnt == nd->path.mnt)
- mntget(path.mnt);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOENT);
+ nd->path.dentry = dentry;
+ /* nd->path.mnt is retained on purpose */
+ nd->inode = inode;
+ nd->seq = nd->next_seq;
+ return NULL;
}
- return pick_link(nd, &path, inode, flags);
+ return step_into_slowpath(nd, flags, dentry);
}
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
@@ -2101,7 +2171,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
if (!nd->root.mnt) {
error = ERR_PTR(set_root(nd));
- if (error)
+ if (unlikely(error))
return error;
}
if (nd->flags & LOOKUP_RCU)
@@ -2131,7 +2201,7 @@ static const char *handle_dots(struct nameidata *nd, int type)
return NULL;
}
-static const char *walk_component(struct nameidata *nd, int flags)
+static __always_inline const char *walk_component(struct nameidata *nd, int flags)
{
struct dentry *dentry;
/*
@@ -2140,7 +2210,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) {
- if (!(flags & WALK_MORE) && nd->depth)
+ if (unlikely(nd->depth) && !(flags & WALK_MORE))
put_link(nd);
return handle_dots(nd, nd->last_type);
}
@@ -2152,7 +2222,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
if (IS_ERR(dentry))
return ERR_CAST(dentry);
}
- if (!(flags & WALK_MORE) && nd->depth)
+ if (unlikely(nd->depth) && !(flags & WALK_MORE))
put_link(nd);
return step_into(nd, flags, dentry);
}
@@ -2505,7 +2575,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (unlikely(!*name)) {
OK:
/* pathname or trailing symlink, done */
- if (!depth) {
+ if (likely(!depth)) {
nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
@@ -2543,10 +2613,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
const char *s = nd->pathname;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
- if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
+ if (unlikely((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED))
return ERR_PTR(-EAGAIN);
- if (!*s)
+ if (unlikely(!*s))
flags &= ~LOOKUP_RCU;
if (flags & LOOKUP_RCU)
rcu_read_lock();
@@ -2560,7 +2630,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
smp_rmb();
- if (nd->state & ND_ROOT_PRESET) {
+ if (unlikely(nd->state & ND_ROOT_PRESET)) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
if (*s && unlikely(!d_can_lookup(root)))
@@ -2579,7 +2649,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->root.mnt = NULL;
/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
- if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
+ if (*s == '/' && likely(!(flags & LOOKUP_IN_ROOT))) {
error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
@@ -2632,7 +2702,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
- if (flags & LOOKUP_IS_SCOPED) {
+ if (unlikely(flags & LOOKUP_IS_SCOPED)) {
nd->root = nd->path;
if (flags & LOOKUP_RCU) {
nd->root_seq = nd->seq;
@@ -2765,6 +2835,62 @@ static int filename_parentat(int dfd, struct filename *name,
return __filename_parentat(dfd, name, flags, parent, last, type, NULL);
}
+/**
+ * __start_dirop - begin a create or remove dirop, performing locking and lookup
+ * @parent: the dentry of the parent in which the operation will occur
+ * @name: a qstr holding the name within that parent
+ * @lookup_flags: intent and other lookup flags.
+ * @state: TASK_KILLABLE to take the parent's lock with
+ * down_write_killable_nested(); any other value takes it with
+ * inode_lock_nested(). Both use the I_MUTEX_PARENT subclass.
+ *
+ * The lookup is performed and necessary locks are taken so that, on success,
+ * the returned dentry can be operated on safely.
+ * The qstr must already have the hash value calculated.
+ *
+ * If the killable lock attempt fails, its error is returned without the
+ * lock being held. If the lookup fails, the parent is unlocked again before
+ * the error is returned.
+ *
+ * start_dirop() is the wrapper which passes TASK_NORMAL for @state.
+ *
+ * Returns: a locked dentry, or an error.
+ *
+ */
+static struct dentry *__start_dirop(struct dentry *parent, struct qstr *name,
+ unsigned int lookup_flags,
+ unsigned int state)
+{
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+
+ if (state == TASK_KILLABLE) {
+ int ret = down_write_killable_nested(&dir->i_rwsem,
+ I_MUTEX_PARENT);
+ if (ret)
+ return ERR_PTR(ret);
+ } else {
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ }
+ dentry = lookup_one_qstr_excl(name, parent, lookup_flags);
+ if (IS_ERR(dentry))
+ inode_unlock(dir); /* nothing to operate on: do not leave the parent locked */
+ return dentry;
+}
+
+struct dentry *start_dirop(struct dentry *parent, struct qstr *name,
+ unsigned int lookup_flags)
+{
+ return __start_dirop(parent, name, lookup_flags, TASK_NORMAL); /* non-killable locking */
+}
+
+/**
+ * end_dirop - signal completion of a dirop
+ * @de: the dentry which was returned by start_dirop or similar.
+ *
+ * If the de is an error, nothing happens. Otherwise the lock on the inode
+ * of the dentry's parent (de->d_parent) is dropped and the dentry itself
+ * is released (dput()).
+ */
+void end_dirop(struct dentry *de)
+{
+ if (!IS_ERR(de)) {
+ inode_unlock(de->d_parent->d_inode);
+ dput(de);
+ }
+}
+EXPORT_SYMBOL(end_dirop);
+
/* does lookup, returns the object with parent locked */
static struct dentry *__start_removing_path(int dfd, struct filename *name,
struct path *path)
@@ -2781,10 +2907,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
return ERR_PTR(-EINVAL);
/* don't fail immediately if it's r/o, at least try to report other errors */
error = mnt_want_write(parent_path.mnt);
- inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+ d = start_dirop(parent_path.dentry, &last, 0);
if (IS_ERR(d))
- goto unlock;
+ goto drop;
if (error)
goto fail;
path->dentry = no_free_ptr(parent_path.dentry);
@@ -2792,10 +2917,9 @@ static struct dentry *__start_removing_path(int dfd, struct filename *name,
return d;
fail:
- dput(d);
+ end_dirop(d);
d = ERR_PTR(error);
-unlock:
- inode_unlock(parent_path.dentry->d_inode);
+drop:
if (!error)
mnt_drop_write(parent_path.mnt);
return d;
@@ -2910,7 +3034,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(vfs_path_lookup);
-static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
+int lookup_noperm_common(struct qstr *qname, struct dentry *base)
{
const char *name = qname->name;
u32 len = qname->len;
@@ -3181,6 +3305,234 @@ struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
}
EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
+/**
+ * start_creating - prepare to create a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned, so
+ * behaviour is similar to O_CREAT without O_EXCL, which doesn't fail
+ * with -EEXIST.
+ *
+ * If lookup_one_common() fails, its error is returned before any lock
+ * is taken.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_one_common(idmap, name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating);
+
+/**
+ * start_removing - prepare to remove a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_one_common(idmap, name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing);
+
+/**
+ * start_creating_killable - prepare to create a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_one_common(idmap, name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return __start_dirop(parent, name, LOOKUP_CREATE, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_creating_killable);
+
+/**
+ * start_removing_killable - prepare to remove a given name with permission checking
+ * @idmap: idmap of the mount
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory. Permission checking (MAY_EXEC) is performed
+ * against @idmap.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * If a signal is received or was already pending, the function aborts
+ * with -EINTR.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_one_common(idmap, name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return __start_dirop(parent, name, 0, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(start_removing_killable);
+
+/**
+ * start_creating_noperm - prepare to create a given name without permission checking
+ * @parent: directory in which to prepare to create the name
+ * @name: the name to be created
+ *
+ * Locks are taken and a lookup is performed prior to creating
+ * an object in a directory.
+ *
+ * If the name already exists, a positive dentry is returned.
+ *
+ * Returns: a negative or positive dentry, or an error.
+ */
+struct dentry *start_creating_noperm(struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_noperm_common(name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return start_dirop(parent, name, LOOKUP_CREATE);
+}
+EXPORT_SYMBOL(start_creating_noperm);
+
+/**
+ * start_removing_noperm - prepare to remove a given name without permission checking
+ * @parent: directory in which to find the name
+ * @name: the name to be removed
+ *
+ * Locks are taken and a lookup is performed prior to removing
+ * an object from a directory.
+ *
+ * If the name doesn't exist, an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: a positive dentry, or an error.
+ */
+struct dentry *start_removing_noperm(struct dentry *parent,
+ struct qstr *name)
+{
+ int err = lookup_noperm_common(name, parent);
+
+ if (err)
+ return ERR_PTR(err);
+ return start_dirop(parent, name, 0);
+}
+EXPORT_SYMBOL(start_removing_noperm);
+
+/**
+ * start_creating_dentry - prepare to create a given dentry
+ * @parent: directory in which the dentry should be created
+ * @child: the dentry to be created
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and negative a reference is taken and
+ * returned. If not an error is returned: -EINVAL if @parent is dead,
+ * is not the parent of @child, or @child is unhashed; -EEXIST if @child
+ * is positive. The lock is dropped again before an error is returned.
+ *
+ * end_creating() should be called when creation is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_creating_dentry(struct dentry *parent,
+ struct dentry *child)
+{
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ if (unlikely(IS_DEADDIR(parent->d_inode) ||
+ child->d_parent != parent ||
+ d_unhashed(child))) {
+ inode_unlock(parent->d_inode);
+ return ERR_PTR(-EINVAL);
+ }
+ if (d_is_positive(child)) {
+ inode_unlock(parent->d_inode);
+ return ERR_PTR(-EEXIST);
+ }
+ return dget(child);
+}
+EXPORT_SYMBOL(start_creating_dentry);
+
+/**
+ * start_removing_dentry - prepare to remove a given dentry
+ * @parent: directory from which dentry should be removed
+ * @child: the dentry to be removed
+ *
+ * A lock is taken to protect the dentry against other dirops and
+ * the validity of the dentry is checked: correct parent and still hashed.
+ *
+ * If the dentry is valid and positive, a reference is taken and
+ * returned. If not an error is returned: -EINVAL if @parent is dead,
+ * is not the parent of @child, or @child is unhashed; -ENOENT if @child
+ * is negative. The lock is dropped again before an error is returned.
+ *
+ * end_removing() should be called when removal is complete, or aborted.
+ *
+ * Returns: the valid dentry, or an error.
+ */
+struct dentry *start_removing_dentry(struct dentry *parent,
+ struct dentry *child)
+{
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ if (unlikely(IS_DEADDIR(parent->d_inode) ||
+ child->d_parent != parent ||
+ d_unhashed(child))) {
+ inode_unlock(parent->d_inode);
+ return ERR_PTR(-EINVAL);
+ }
+ if (d_is_negative(child)) {
+ inode_unlock(parent->d_inode);
+ return ERR_PTR(-ENOENT);
+ }
+ return dget(child);
+}
+EXPORT_SYMBOL(start_removing_dentry);
+
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
@@ -3419,6 +3771,290 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
EXPORT_SYMBOL(unlock_rename);
/**
+ * __start_renaming - lookup and lock names for rename
+ * @rd: rename data containing parents and flags, and
+ * for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ * LOOKUP_NO_SYMLINKS etc).
+ * @old_last: name of object in @rd.old_parent
+ * @new_last: name of object in @rd.new_parent
+ *
+ * Look up two names and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentries are stored in @rd.old_dentry,
+ * @rd.new_dentry and an extra ref is taken on @rd.old_parent.
+ * These references and the lock are dropped by end_renaming().
+ *
+ * The passed in qstrs must have the hash calculated, and no permission
+ * checking is performed.
+ *
+ * Fails with -EINVAL if the source is an ancestor of the target. If the
+ * target is an ancestor of the source it fails with -EINVAL for
+ * RENAME_EXCHANGE and with -ENOTEMPTY otherwise. Errors from lock_rename()
+ * and from either lookup are passed through. On any failure the looked-up
+ * dentries are released and the rename lock is dropped.
+ *
+ * Returns: zero or an error.
+ */
+static int
+__start_renaming(struct renamedata *rd, int lookup_flags,
+ struct qstr *old_last, struct qstr *new_last)
+{
+ struct dentry *trap;
+ struct dentry *d1, *d2;
+ int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+ int err;
+
+ if (rd->flags & RENAME_EXCHANGE)
+ target_flags = 0;
+ if (rd->flags & RENAME_NOREPLACE)
+ target_flags |= LOOKUP_EXCL;
+
+ trap = lock_rename(rd->old_parent, rd->new_parent);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
+
+ d1 = lookup_one_qstr_excl(old_last, rd->old_parent,
+ lookup_flags);
+ err = PTR_ERR(d1);
+ if (IS_ERR(d1))
+ goto out_unlock;
+
+ d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
+ lookup_flags | target_flags);
+ err = PTR_ERR(d2);
+ if (IS_ERR(d2))
+ goto out_dput_d1;
+
+ if (d1 == trap) {
+ /* source is an ancestor of target */
+ err = -EINVAL;
+ goto out_dput_d2;
+ }
+
+ if (d2 == trap) {
+ /* target is an ancestor of source */
+ if (rd->flags & RENAME_EXCHANGE)
+ err = -EINVAL;
+ else
+ err = -ENOTEMPTY;
+ goto out_dput_d2;
+ }
+
+ rd->old_dentry = d1;
+ rd->new_dentry = d2;
+ dget(rd->old_parent);
+ return 0;
+
+out_dput_d2:
+ dput(d2);
+out_dput_d1:
+ dput(d1);
+out_unlock:
+ unlock_rename(rd->old_parent, rd->new_parent);
+ return err;
+}
+
+/**
+ * start_renaming - lookup and lock names for rename with permission checking
+ * @rd: rename data containing parents and flags, and
+ * for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ * LOOKUP_NO_SYMLINKS etc).
+ * @old_last: name of object in @rd.old_parent
+ * @new_last: name of object in @rd.new_parent
+ *
+ * Look up two names and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentries are stored in @rd.old_dentry,
+ * @rd.new_dentry. Also the refcount on @rd->old_parent is increased.
+ * These references and the lock are dropped by end_renaming().
+ *
+ * The passed in qstrs need not have the hash calculated, and basic
+ * eXecute permission checking is performed against @rd.mnt_idmap.
+ *
+ * If lookup_one_common() fails for either name, its error is returned
+ * before any lock is taken.
+ *
+ * Returns: zero or an error.
+ */
+int start_renaming(struct renamedata *rd, int lookup_flags,
+ struct qstr *old_last, struct qstr *new_last)
+{
+ int err;
+
+ err = lookup_one_common(rd->mnt_idmap, old_last, rd->old_parent);
+ if (err)
+ return err;
+ err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+ if (err)
+ return err;
+ return __start_renaming(rd, lookup_flags, old_last, new_last);
+}
+EXPORT_SYMBOL(start_renaming);
+
+static int
+__start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+ struct dentry *old_dentry, struct qstr *new_last)
+{
+ struct dentry *trap;
+ struct dentry *d2;
+ int target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+ int err;
+
+ if (rd->flags & RENAME_EXCHANGE)
+ target_flags = 0;
+ if (rd->flags & RENAME_NOREPLACE)
+ target_flags |= LOOKUP_EXCL;
+
+ /* Already have the dentry - need to be sure to lock the correct parent */
+ trap = lock_rename_child(old_dentry, rd->new_parent);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
+ if (d_unhashed(old_dentry) ||
+ (rd->old_parent && rd->old_parent != old_dentry->d_parent)) {
+ /* dentry was removed, or moved and explicit parent requested */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ d2 = lookup_one_qstr_excl(new_last, rd->new_parent,
+ lookup_flags | target_flags);
+ err = PTR_ERR(d2);
+ if (IS_ERR(d2))
+ goto out_unlock;
+
+ if (old_dentry == trap) {
+ /* source is an ancestor of target */
+ err = -EINVAL;
+ goto out_dput_d2;
+ }
+
+ if (d2 == trap) {
+ /* target is an ancestor of source */
+ if (rd->flags & RENAME_EXCHANGE)
+ err = -EINVAL;
+ else
+ err = -ENOTEMPTY;
+ goto out_dput_d2;
+ }
+
+ rd->old_dentry = dget(old_dentry); /* extra ref on the caller's dentry */
+ rd->new_dentry = d2;
+ rd->old_parent = dget(old_dentry->d_parent); /* also fills in old_parent when the caller passed NULL */
+ return 0;
+
+out_dput_d2:
+ dput(d2);
+out_unlock:
+ unlock_rename(old_dentry->d_parent, rd->new_parent); /* rd->old_parent may be NULL here, so use the dentry's parent */
+ return err;
+}
+
+/**
+ * start_renaming_dentry - lookup and lock name for rename with permission checking
+ * @rd: rename data containing parents and flags, and
+ * for receiving found dentries
+ * @lookup_flags: extra flags to pass to ->lookup (e.g. LOOKUP_REVAL,
+ * LOOKUP_NO_SYMLINKS etc).
+ * @old_dentry: dentry of name to move
+ * @new_last: name of target in @rd.new_parent
+ *
+ * Look up target name and ensure locks are in place for
+ * rename.
+ *
+ * On success the found dentry is stored in @rd.new_dentry and
+ * @rd.old_parent is confirmed to be the parent of @old_dentry. If it
+ * was originally %NULL, it is set. In either case a reference is taken
+ * so that end_renaming() can have a stable reference to unlock.
+ *
+ * References and the lock can be dropped with end_renaming()
+ *
+ * The passed in qstr need not have the hash calculated, and basic
+ * eXecute permission checking is performed against @rd.mnt_idmap.
+ *
+ * Fails with -EINVAL if @old_dentry is unhashed or, when @rd.old_parent
+ * was supplied, is no longer a child of it. If lookup_one_common() fails
+ * its error is returned before any lock is taken.
+ *
+ * Returns: zero or an error.
+ */
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+ struct dentry *old_dentry, struct qstr *new_last)
+{
+ int err;
+
+ err = lookup_one_common(rd->mnt_idmap, new_last, rd->new_parent);
+ if (err)
+ return err;
+ return __start_renaming_dentry(rd, lookup_flags, old_dentry, new_last);
+}
+EXPORT_SYMBOL(start_renaming_dentry);
+
+/**
+ * start_renaming_two_dentries - Lock two dentries in given parents for rename
+ * @rd: rename data containing parents and flags
+ * @old_dentry: dentry of name to move
+ * @new_dentry: dentry to move to
+ *
+ * Ensure locks are in place for rename and check parentage is still correct.
+ *
+ * On success the two dentries are stored in @rd.old_dentry and
+ * @rd.new_dentry and @rd.old_parent and @rd.new_parent are confirmed to
+ * be the parents of the dentries. @rd.old_parent may be %NULL on entry,
+ * in which case it is set to the parent of @old_dentry. A reference is
+ * taken on both dentries and on @rd.old_parent.
+ *
+ * Fails with -EINVAL if either dentry is unhashed or has the wrong parent,
+ * or if the source is an ancestor of the target. If the target is an
+ * ancestor of the source it fails with -EINVAL for RENAME_EXCHANGE and
+ * -ENOTEMPTY otherwise. Fails with -EEXIST if @new_dentry is positive
+ * and RENAME_NOREPLACE is set.
+ *
+ * References and the lock can be dropped with end_renaming()
+ *
+ * Returns: zero or an error.
+ */
+int
+start_renaming_two_dentries(struct renamedata *rd,
+ struct dentry *old_dentry, struct dentry *new_dentry)
+{
+ struct dentry *trap;
+ int err;
+
+ /* Already have the dentry - need to be sure to lock the correct parent */
+ trap = lock_rename_child(old_dentry, rd->new_parent);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
+ err = -EINVAL;
+ if (d_unhashed(old_dentry) ||
+ (rd->old_parent && rd->old_parent != old_dentry->d_parent))
+ /* old_dentry was removed, or moved and explicit parent requested */
+ goto out_unlock;
+ if (d_unhashed(new_dentry) ||
+ rd->new_parent != new_dentry->d_parent)
+ /* new_dentry was removed or moved */
+ goto out_unlock;
+
+ if (old_dentry == trap)
+ /* source is an ancestor of target */
+ goto out_unlock;
+
+ if (new_dentry == trap) {
+ /* target is an ancestor of source */
+ if (rd->flags & RENAME_EXCHANGE)
+ err = -EINVAL;
+ else
+ err = -ENOTEMPTY;
+ goto out_unlock;
+ }
+
+ err = -EEXIST;
+ if (d_is_positive(new_dentry) && (rd->flags & RENAME_NOREPLACE))
+ goto out_unlock;
+
+ rd->old_dentry = dget(old_dentry);
+ rd->new_dentry = dget(new_dentry);
+ rd->old_parent = dget(old_dentry->d_parent);
+ return 0;
+
+out_unlock:
+ unlock_rename(old_dentry->d_parent, rd->new_parent);
+ return err;
+}
+EXPORT_SYMBOL(start_renaming_two_dentries);
+
+void end_renaming(struct renamedata *rd)
+{
+ unlock_rename(rd->old_parent, rd->new_parent); /* release the lock taken when the rename was started */
+ dput(rd->old_dentry);
+ dput(rd->new_dentry);
+ dput(rd->old_parent); /* drop the extra parent ref the start_renaming*() helpers take */
+}
+EXPORT_SYMBOL(end_renaming);
+
+/**
* vfs_prepare_mode - prepare the mode to be used for a new inode
* @idmap: idmap of the mount the inode was found from
* @dir: parent directory of the new inode
@@ -3461,10 +4097,9 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
/**
* vfs_create - create new file
* @idmap: idmap of the mount the inode was found from
- * @dir: inode of the parent directory
* @dentry: dentry of the child file
* @mode: mode of the child file
- * @want_excl: whether the file must not yet exist
+ * @di: returns parent inode, if the inode is delegated.
*
* Create a new file.
*
@@ -3474,9 +4109,10 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
*/
-int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool want_excl)
+int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
+ struct delegated_inode *di)
{
+ struct inode *dir = d_inode(dentry->d_parent);
int error;
error = may_create(idmap, dir, dentry);
@@ -3490,7 +4126,10 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
- error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
+ error = try_break_deleg(dir, di);
+ if (error)
+ return error;
+ error = dir->i_op->create(idmap, dir, dentry, mode, true);
if (!error)
fsnotify_create(dir, dentry);
return error;
@@ -3697,7 +4336,7 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
*/
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
const struct open_flags *op,
- bool got_write)
+ bool got_write, struct delegated_inode *delegated_inode)
{
struct mnt_idmap *idmap;
struct dentry *dir = nd->path.dentry;
@@ -3786,6 +4425,11 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
+ /* but break the directory lease first! */
+ error = try_break_deleg(dir_inode, delegated_inode);
+ if (error)
+ goto out_dput;
+
file->f_mode |= FMODE_CREATED;
audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
if (!dir_inode->i_op->create) {
@@ -3848,6 +4492,7 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
+ struct delegated_inode delegated_inode = { };
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
@@ -3879,7 +4524,7 @@ static const char *open_last_lookups(struct nameidata *nd,
return ERR_PTR(-ECHILD);
}
}
-
+retry:
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
got_write = !mnt_want_write(nd->path.mnt);
/*
@@ -3892,7 +4537,7 @@ static const char *open_last_lookups(struct nameidata *nd,
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
- dentry = lookup_open(nd, file, op, got_write);
+ dentry = lookup_open(nd, file, op, got_write, &delegated_inode);
if (!IS_ERR(dentry)) {
if (file->f_mode & FMODE_CREATED)
fsnotify_create(dir->d_inode, dentry);
@@ -3907,8 +4552,16 @@ static const char *open_last_lookups(struct nameidata *nd,
if (got_write)
mnt_drop_write(nd->path.mnt);
- if (IS_ERR(dentry))
+ if (IS_ERR(dentry)) {
+ if (is_delegated(&delegated_inode)) {
+ int error = break_deleg_wait(&delegated_inode);
+
+ if (!error)
+ goto retry;
+ return ERR_PTR(error);
+ }
return ERR_CAST(dentry);
+ }
if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
dput(nd->path.dentry);
@@ -4036,7 +4689,7 @@ int vfs_tmpfile(struct mnt_idmap *idmap,
inode = file_inode(file);
if (!(open_flag & O_EXCL)) {
spin_lock(&inode->i_lock);
- inode->i_state |= I_LINKABLE;
+ inode_state_set(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
}
security_inode_post_create_tmpfile(idmap, inode);
@@ -4223,21 +4876,18 @@ static struct dentry *filename_create(int dfd, struct filename *name,
*/
if (last.name[last.len] && !want_dir)
create_flags &= ~LOOKUP_CREATE;
- inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- dentry = lookup_one_qstr_excl(&last, path->dentry,
- reval_flag | create_flags);
+ dentry = start_dirop(path->dentry, &last, reval_flag | create_flags);
if (IS_ERR(dentry))
- goto unlock;
+ goto out_drop_write;
if (unlikely(error))
goto fail;
return dentry;
fail:
- dput(dentry);
+ end_dirop(dentry);
dentry = ERR_PTR(error);
-unlock:
- inode_unlock(path->dentry->d_inode);
+out_drop_write:
if (!error)
mnt_drop_write(path->mnt);
out:
@@ -4256,11 +4906,20 @@ struct dentry *start_creating_path(int dfd, const char *pathname,
}
EXPORT_SYMBOL(start_creating_path);
+/**
+ * end_creating_path - finish a code section started by start_creating_path()
+ * @path: the path instantiated by start_creating_path()
+ * @dentry: the dentry returned by start_creating_path()
+ *
+ * end_creating_path() will unlock any locks taken by start_creating_path()
+ * and drop any references that were taken. It should only be called
+ * if start_creating_path() returned a non-error.
+ * If vfs_mkdir() was called and it returned an error, that error *should*
+ * be passed to end_creating_path() together with the path.
+ */
void end_creating_path(const struct path *path, struct dentry *dentry)
{
- if (!IS_ERR(dentry))
- dput(dentry);
- inode_unlock(path->dentry->d_inode);
+ end_creating(dentry);
mnt_drop_write(path->mnt);
path_put(path);
}
@@ -4278,13 +4937,15 @@ inline struct dentry *start_creating_user_path(
}
EXPORT_SYMBOL(start_creating_user_path);
+
/**
* vfs_mknod - create device node or file
- * @idmap: idmap of the mount the inode was found from
- * @dir: inode of the parent directory
- * @dentry: dentry of the child device node
- * @mode: mode of the child device node
- * @dev: device number of device to create
+ * @idmap: idmap of the mount the inode was found from
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the child device node
+ * @mode: mode of the child device node
+ * @dev: device number of device to create
+ * @delegated_inode: returns parent inode, if the inode is delegated.
*
* Create a device node or file.
*
@@ -4295,7 +4956,8 @@ EXPORT_SYMBOL(start_creating_user_path);
* raw inode simply pass @nop_mnt_idmap.
*/
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
+ struct dentry *dentry, umode_t mode, dev_t dev,
+ struct delegated_inode *delegated_inode)
{
bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
int error = may_create(idmap, dir, dentry);
@@ -4319,6 +4981,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
if (error)
return error;
+ error = try_break_deleg(dir, delegated_inode);
+ if (error)
+ return error;
+
error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
@@ -4346,6 +5012,7 @@ static int may_mknod(umode_t mode)
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
unsigned int dev)
{
+ struct delegated_inode di = { };
struct mnt_idmap *idmap;
struct dentry *dentry;
struct path path;
@@ -4369,22 +5036,26 @@ retry:
idmap = mnt_idmap(path.mnt);
switch (mode & S_IFMT) {
case 0: case S_IFREG:
- error = vfs_create(idmap, path.dentry->d_inode,
- dentry, mode, true);
+ error = vfs_create(idmap, dentry, mode, &di);
if (!error)
security_path_post_mknod(idmap, dentry);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(idmap, path.dentry->d_inode,
- dentry, mode, new_decode_dev(dev));
+ dentry, mode, new_decode_dev(dev), &di);
break;
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(idmap, path.dentry->d_inode,
- dentry, mode, 0);
+ dentry, mode, 0, &di);
break;
}
out2:
end_creating_path(&path, dentry);
+ if (is_delegated(&di)) {
+ error = break_deleg_wait(&di);
+ if (!error)
+ goto retry;
+ }
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4407,10 +5078,11 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
/**
* vfs_mkdir - create directory returning correct dentry if possible
- * @idmap: idmap of the mount the inode was found from
- * @dir: inode of the parent directory
- * @dentry: dentry of the child directory
- * @mode: mode of the child directory
+ * @idmap: idmap of the mount the inode was found from
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the child directory
+ * @mode: mode of the child directory
+ * @delegated_inode: returns parent inode, if the inode is delegated.
*
* Create a directory.
*
@@ -4427,7 +5099,8 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* In case of an error the dentry is dput() and an ERR_PTR() is returned.
*/
struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct dentry *dentry, umode_t mode,
+ struct delegated_inode *delegated_inode)
{
int error;
unsigned max_links = dir->i_sb->s_max_links;
@@ -4450,6 +5123,10 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (max_links && dir->i_nlink >= max_links)
goto err;
+ error = try_break_deleg(dir, delegated_inode);
+ if (error)
+ goto err;
+
de = dir->i_op->mkdir(idmap, dir, dentry, mode);
error = PTR_ERR(de);
if (IS_ERR(de))
@@ -4462,7 +5139,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
return dentry;
err:
- dput(dentry);
+ end_creating(dentry);
return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_mkdir);
@@ -4473,6 +5150,7 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode)
struct path path;
int error;
unsigned int lookup_flags = LOOKUP_DIRECTORY;
+ struct delegated_inode delegated_inode = { };
retry:
dentry = filename_create(dfd, name, &path, lookup_flags);
@@ -4484,11 +5162,16 @@ retry:
mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, mode);
+ dentry, mode, &delegated_inode);
if (IS_ERR(dentry))
error = PTR_ERR(dentry);
}
end_creating_path(&path, dentry);
+ if (is_delegated(&delegated_inode)) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry;
+ }
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4510,9 +5193,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
/**
* vfs_rmdir - remove directory
- * @idmap: idmap of the mount the inode was found from
- * @dir: inode of the parent directory
- * @dentry: dentry of the child directory
+ * @idmap: idmap of the mount the inode was found from
+ * @dir: inode of the parent directory
+ * @dentry: dentry of the child directory
+ * @delegated_inode: returns parent inode, if it's delegated.
*
* Remove a directory.
*
@@ -4523,7 +5207,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
* raw inode simply pass @nop_mnt_idmap.
*/
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry)
+ struct dentry *dentry, struct delegated_inode *delegated_inode)
{
int error = may_delete(idmap, dir, dentry, 1);
@@ -4545,6 +5229,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
if (error)
goto out;
+ error = try_break_deleg(dir, delegated_inode);
+ if (error)
+ goto out;
+
error = dir->i_op->rmdir(dir, dentry);
if (error)
goto out;
@@ -4571,6 +5259,7 @@ int do_rmdir(int dfd, struct filename *name)
struct qstr last;
int type;
unsigned int lookup_flags = 0;
+ struct delegated_inode delegated_inode = { };
retry:
error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
if (error)
@@ -4592,22 +5281,26 @@ retry:
if (error)
goto exit2;
- inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
- dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+ dentry = start_dirop(path.dentry, &last, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
error = security_path_rmdir(&path, dentry);
if (error)
goto exit4;
- error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
+ error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ dentry, &delegated_inode);
exit4:
- dput(dentry);
+ end_dirop(dentry);
exit3:
- inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
exit2:
path_put(&path);
+ if (is_delegated(&delegated_inode)) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry;
+ }
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4648,7 +5341,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* raw inode simply pass @nop_mnt_idmap.
*/
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, struct inode **delegated_inode)
+ struct dentry *dentry, struct delegated_inode *delegated_inode)
{
struct inode *target = dentry->d_inode;
int error = may_delete(idmap, dir, dentry, 0);
@@ -4667,6 +5360,9 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
else {
error = security_inode_unlink(dir, dentry);
if (!error) {
+ error = try_break_deleg(dir, delegated_inode);
+ if (error)
+ goto out;
error = try_break_deleg(target, delegated_inode);
if (error)
goto out;
@@ -4705,67 +5401,62 @@ int do_unlinkat(int dfd, struct filename *name)
struct path path;
struct qstr last;
int type;
- struct inode *inode = NULL;
- struct inode *delegated_inode = NULL;
+ struct inode *inode;
+ struct delegated_inode delegated_inode = { };
unsigned int lookup_flags = 0;
retry:
error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
if (error)
- goto exit1;
+ goto exit_putname;
error = -EISDIR;
if (type != LAST_NORM)
- goto exit2;
+ goto exit_path_put;
error = mnt_want_write(path.mnt);
if (error)
- goto exit2;
+ goto exit_path_put;
retry_deleg:
- inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
- dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
+ dentry = start_dirop(path.dentry, &last, lookup_flags);
error = PTR_ERR(dentry);
- if (!IS_ERR(dentry)) {
+ if (IS_ERR(dentry))
+ goto exit_drop_write;
- /* Why not before? Because we want correct error value */
- if (last.name[last.len])
- goto slashes;
- inode = dentry->d_inode;
- ihold(inode);
- error = security_path_unlink(&path, dentry);
- if (error)
- goto exit3;
- error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, &delegated_inode);
-exit3:
- dput(dentry);
+ /* Why not before? Because we want correct error value */
+ if (unlikely(last.name[last.len])) {
+ if (d_is_dir(dentry))
+ error = -EISDIR;
+ else
+ error = -ENOTDIR;
+ end_dirop(dentry);
+ goto exit_drop_write;
}
- inode_unlock(path.dentry->d_inode);
- if (inode)
- iput(inode); /* truncate the inode here */
- inode = NULL;
- if (delegated_inode) {
+ inode = dentry->d_inode;
+ ihold(inode);
+ error = security_path_unlink(&path, dentry);
+ if (error)
+ goto exit_end_dirop;
+ error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
+ dentry, &delegated_inode);
+exit_end_dirop:
+ end_dirop(dentry);
+ iput(inode); /* truncate the inode here */
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
+exit_drop_write:
mnt_drop_write(path.mnt);
-exit2:
+exit_path_put:
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
- inode = NULL;
goto retry;
}
-exit1:
+exit_putname:
putname(name);
return error;
-
-slashes:
- if (d_is_dir(dentry))
- error = -EISDIR;
- else
- error = -ENOTDIR;
- goto exit3;
}
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
@@ -4789,6 +5480,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
* @dir: inode of the parent directory
* @dentry: dentry of the child symlink file
* @oldname: name of the file to link to
+ * @delegated_inode: returns parent inode, if the inode is delegated.
*
* Create a symlink.
*
@@ -4799,7 +5491,8 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
* raw inode simply pass @nop_mnt_idmap.
*/
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, const char *oldname)
+ struct dentry *dentry, const char *oldname,
+ struct delegated_inode *delegated_inode)
{
int error;
@@ -4814,6 +5507,10 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (error)
return error;
+ error = try_break_deleg(dir, delegated_inode);
+ if (error)
+ return error;
+
error = dir->i_op->symlink(idmap, dir, dentry, oldname);
if (!error)
fsnotify_create(dir, dentry);
@@ -4827,6 +5524,7 @@ int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
struct dentry *dentry;
struct path path;
unsigned int lookup_flags = 0;
+ struct delegated_inode delegated_inode = { };
if (IS_ERR(from)) {
error = PTR_ERR(from);
@@ -4841,8 +5539,13 @@ retry:
error = security_path_symlink(&path, dentry, from->name);
if (!error)
error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
- dentry, from->name);
+ dentry, from->name, &delegated_inode);
end_creating_path(&path, dentry);
+ if (is_delegated(&delegated_inode)) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry;
+ }
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
@@ -4892,7 +5595,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
*/
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
struct inode *dir, struct dentry *new_dentry,
- struct inode **delegated_inode)
+ struct delegated_inode *delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
@@ -4931,19 +5634,21 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
inode_lock(inode);
/* Make sure we don't allow creating hardlink to an unlinked file */
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
error = -ENOENT;
else if (max_links && inode->i_nlink >= max_links)
error = -EMLINK;
else {
- error = try_break_deleg(inode, delegated_inode);
+ error = try_break_deleg(dir, delegated_inode);
+ if (!error)
+ error = try_break_deleg(inode, delegated_inode);
if (!error)
error = dir->i_op->link(old_dentry, dir, new_dentry);
}
- if (!error && (inode->i_state & I_LINKABLE)) {
+ if (!error && (inode_state_read_once(inode) & I_LINKABLE)) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_LINKABLE;
+ inode_state_clear(inode, I_LINKABLE);
spin_unlock(&inode->i_lock);
}
inode_unlock(inode);
@@ -4968,7 +5673,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
struct mnt_idmap *idmap;
struct dentry *new_dentry;
struct path old_path, new_path;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
int how = 0;
int error;
@@ -5012,7 +5717,7 @@ retry:
new_dentry, &delegated_inode);
out_dput:
end_creating_path(&new_path, new_dentry);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error) {
path_put(&old_path);
@@ -5098,7 +5803,7 @@ int vfs_rename(struct renamedata *rd)
struct inode *new_dir = d_inode(rd->new_parent);
struct dentry *old_dentry = rd->old_dentry;
struct dentry *new_dentry = rd->new_dentry;
- struct inode **delegated_inode = rd->delegated_inode;
+ struct delegated_inode *delegated_inode = rd->delegated_inode;
unsigned int flags = rd->flags;
bool is_dir = d_is_dir(old_dentry);
struct inode *source = old_dentry->d_inode;
@@ -5203,6 +5908,14 @@ int vfs_rename(struct renamedata *rd)
old_dir->i_nlink >= max_links)
goto out;
}
+ error = try_break_deleg(old_dir, delegated_inode);
+ if (error)
+ goto out;
+ if (new_dir != old_dir) {
+ error = try_break_deleg(new_dir, delegated_inode);
+ if (error)
+ goto out;
+ }
if (!is_dir) {
error = try_break_deleg(source, delegated_inode);
if (error)
@@ -5256,14 +5969,11 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct filename *to, unsigned int flags)
{
struct renamedata rd;
- struct dentry *old_dentry, *new_dentry;
- struct dentry *trap;
struct path old_path, new_path;
struct qstr old_last, new_last;
int old_type, new_type;
- struct inode *delegated_inode = NULL;
- unsigned int lookup_flags = 0, target_flags =
- LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
+ struct delegated_inode delegated_inode = { };
+ unsigned int lookup_flags = 0;
bool should_retry = false;
int error = -EINVAL;
@@ -5274,11 +5984,6 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
(flags & RENAME_EXCHANGE))
goto put_names;
- if (flags & RENAME_EXCHANGE)
- target_flags = 0;
- if (flags & RENAME_NOREPLACE)
- target_flags |= LOOKUP_EXCL;
-
retry:
error = filename_parentat(olddfd, from, lookup_flags, &old_path,
&old_last, &old_type);
@@ -5308,68 +6013,42 @@ retry:
goto exit2;
retry_deleg:
- trap = lock_rename(new_path.dentry, old_path.dentry);
- if (IS_ERR(trap)) {
- error = PTR_ERR(trap);
+ rd.old_parent = old_path.dentry;
+ rd.mnt_idmap = mnt_idmap(old_path.mnt);
+ rd.new_parent = new_path.dentry;
+ rd.delegated_inode = &delegated_inode;
+ rd.flags = flags;
+
+ error = __start_renaming(&rd, lookup_flags, &old_last, &new_last);
+ if (error)
goto exit_lock_rename;
- }
- old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
- lookup_flags);
- error = PTR_ERR(old_dentry);
- if (IS_ERR(old_dentry))
- goto exit3;
- new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
- lookup_flags | target_flags);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
if (flags & RENAME_EXCHANGE) {
- if (!d_is_dir(new_dentry)) {
+ if (!d_is_dir(rd.new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
- goto exit5;
+ goto exit_unlock;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
- if (!d_is_dir(old_dentry)) {
+ if (!d_is_dir(rd.old_dentry)) {
error = -ENOTDIR;
if (old_last.name[old_last.len])
- goto exit5;
+ goto exit_unlock;
if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
- goto exit5;
+ goto exit_unlock;
}
- /* source should not be ancestor of target */
- error = -EINVAL;
- if (old_dentry == trap)
- goto exit5;
- /* target should not be an ancestor of source */
- if (!(flags & RENAME_EXCHANGE))
- error = -ENOTEMPTY;
- if (new_dentry == trap)
- goto exit5;
- error = security_path_rename(&old_path, old_dentry,
- &new_path, new_dentry, flags);
+ error = security_path_rename(&old_path, rd.old_dentry,
+ &new_path, rd.new_dentry, flags);
if (error)
- goto exit5;
+ goto exit_unlock;
- rd.old_parent = old_path.dentry;
- rd.old_dentry = old_dentry;
- rd.mnt_idmap = mnt_idmap(old_path.mnt);
- rd.new_parent = new_path.dentry;
- rd.new_dentry = new_dentry;
- rd.delegated_inode = &delegated_inode;
- rd.flags = flags;
error = vfs_rename(&rd);
-exit5:
- dput(new_dentry);
-exit4:
- dput(old_dentry);
-exit3:
- unlock_rename(new_path.dentry, old_path.dentry);
+exit_unlock:
+ end_renaming(&rd);
exit_lock_rename:
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
diff --git a/fs/namespace.c b/fs/namespace.c
index d82910f33dc4..c58674a20cad 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -132,16 +132,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
-{
- struct ns_common *ns;
-
- if (!node)
- return NULL;
- ns = rb_entry(node, struct ns_common, ns_tree_node);
- return container_of(ns, struct mnt_namespace, ns);
-}
-
static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
@@ -151,7 +141,8 @@ static void mnt_ns_release(struct mnt_namespace *ns)
kfree(ns);
}
}
-DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+DEFINE_FREE(mnt_ns_release, struct mnt_namespace *,
+ if (!IS_ERR(_T)) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
@@ -1345,26 +1336,12 @@ static void delayed_mntput(struct work_struct *unused)
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
-static void mntput_no_expire(struct mount *mnt)
+static void noinline mntput_no_expire_slowpath(struct mount *mnt)
{
LIST_HEAD(list);
int count;
- rcu_read_lock();
- if (likely(READ_ONCE(mnt->mnt_ns))) {
- /*
- * Since we don't do lock_mount_hash() here,
- * ->mnt_ns can change under us. However, if it's
- * non-NULL, then there's a reference that won't
- * be dropped until after an RCU delay done after
- * turning ->mnt_ns NULL. So if we observe it
- * non-NULL under rcu_read_lock(), the reference
- * we are dropping is not the final one.
- */
- mnt_add_count(mnt, -1);
- rcu_read_unlock();
- return;
- }
+ VFS_BUG_ON(mnt->mnt_ns);
lock_mount_hash();
/*
* make sure that if __legitimize_mnt() has not seen us grab
@@ -1415,6 +1392,26 @@ static void mntput_no_expire(struct mount *mnt)
cleanup_mnt(mnt);
}
+static void mntput_no_expire(struct mount *mnt)
+{
+ rcu_read_lock();
+ if (likely(READ_ONCE(mnt->mnt_ns))) {
+ /*
+ * Since we don't do lock_mount_hash() here,
+ * ->mnt_ns can change under us. However, if it's
+ * non-NULL, then there's a reference that won't
+ * be dropped until after an RCU delay done after
+ * turning ->mnt_ns NULL. So if we observe it
+ * non-NULL under rcu_read_lock(), the reference
+ * we are dropping is not the final one.
+ */
+ mnt_add_count(mnt, -1);
+ rcu_read_unlock();
+ return;
+ }
+ mntput_no_expire_slowpath(mnt);
+}
+
void mntput(struct vfsmount *mnt)
{
if (mnt) {
@@ -3103,19 +3100,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
- int fd;
- struct file *file __free(fput) = NULL;
-
- file = vfs_open_tree(dfd, filename, flags);
- if (IS_ERR(file))
- return PTR_ERR(file);
-
- fd = get_unused_fd_flags(flags & O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- fd_install(fd, no_free_ptr(file));
- return fd;
+ return FD_ADD(flags, vfs_open_tree(dfd, filename, flags));
}
/*
@@ -4093,8 +4078,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
dec_mnt_namespaces(ucounts);
return ERR_PTR(ret);
}
- if (!anon)
- ns_tree_gen_id(&new_ns->ns);
+ ns_tree_gen_id(new_ns);
+
+ new_ns->is_anon = anon;
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
@@ -4283,10 +4269,10 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
unsigned int, attr_flags)
{
+ struct path new_path __free(path_put) = {};
struct mnt_namespace *ns;
struct fs_context *fc;
- struct file *file;
- struct path newmount;
+ struct vfsmount *new_mnt;
struct mount *mnt;
unsigned int mnt_flags = 0;
long ret;
@@ -4324,35 +4310,36 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
fc = fd_file(f)->private_data;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
+ ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex);
+ ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex);
+ if (ret)
return ret;
/* There must be a valid superblock or we can't mount it */
ret = -EINVAL;
if (!fc->root)
- goto err_unlock;
+ return ret;
ret = -EPERM;
if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
errorfcp(fc, "VFS", "Mount too revealing");
- goto err_unlock;
+ return ret;
}
ret = -EBUSY;
if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
- goto err_unlock;
+ return ret;
if (fc->sb_flags & SB_MANDLOCK)
warn_mandlock();
- newmount.mnt = vfs_create_mount(fc);
- if (IS_ERR(newmount.mnt)) {
- ret = PTR_ERR(newmount.mnt);
- goto err_unlock;
- }
- newmount.dentry = dget(fc->root);
- newmount.mnt->mnt_flags = mnt_flags;
+ new_mnt = vfs_create_mount(fc);
+ if (IS_ERR(new_mnt))
+ return PTR_ERR(new_mnt);
+ new_mnt->mnt_flags = mnt_flags;
+
+ new_path.dentry = dget(fc->root);
+ new_path.mnt = new_mnt;
/* We've done the mount bit - now move the file context into more or
* less the same state as if we'd done an fspick(). We don't want to
@@ -4362,38 +4349,27 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
vfs_clean_context(fc);
ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
- if (IS_ERR(ns)) {
- ret = PTR_ERR(ns);
- goto err_path;
- }
- mnt = real_mount(newmount.mnt);
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
+ mnt = real_mount(new_path.mnt);
ns->root = mnt;
ns->nr_mounts = 1;
mnt_add_to_ns(ns, mnt);
- mntget(newmount.mnt);
+ mntget(new_path.mnt);
- /* Attach to an apparent O_PATH fd with a note that we need to unmount
- * it, not just simply put it.
- */
- file = dentry_open(&newmount, O_PATH, fc->cred);
- if (IS_ERR(file)) {
- dissolve_on_fput(newmount.mnt);
- ret = PTR_ERR(file);
- goto err_path;
+ FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
+ dentry_open(&new_path, O_PATH, fc->cred));
+ if (fdf.err) {
+ dissolve_on_fput(new_path.mnt);
+ return fdf.err;
}
- file->f_mode |= FMODE_NEED_UNMOUNT;
-
- ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
- if (ret >= 0)
- fd_install(ret, file);
- else
- fput(file);
-err_path:
- path_put(&newmount);
-err_unlock:
- mutex_unlock(&fc->uapi_mutex);
- return ret;
+ /*
+ * Attach to an apparent O_PATH fd with a note that we
+ * need to unmount it, not just simply put it.
+ */
+ fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT;
+ return fd_publish(fdf);
}
static inline int vfs_move_mount(const struct path *from_path,
@@ -5035,19 +5011,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
unsigned, flags, struct mount_attr __user *, uattr,
size_t, usize)
{
- struct file __free(fput) *file = NULL;
- int fd;
-
if (!uattr && usize)
return -EINVAL;
- file = vfs_open_tree(dfd, filename, flags);
- if (IS_ERR(file))
- return PTR_ERR(file);
+ FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags));
+ if (fdf.err)
+ return fdf.err;
if (uattr) {
- int ret;
struct mount_kattr kattr = {};
+ struct file *file = fd_prepare_file(fdf);
+ int ret;
if (flags & OPEN_TREE_CLONE)
kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
@@ -5063,12 +5037,7 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
return ret;
}
- fd = get_unused_fd_flags(flags & O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- fd_install(fd, no_free_ptr(file));
- return fd;
+ return fd_publish(fdf);
}
int show_path(struct seq_file *m, struct dentry *root)
@@ -5150,6 +5119,12 @@ static u64 mnt_to_propagation_flags(struct mount *m)
return propagation;
}
+u64 vfsmount_to_propagation_flags(struct vfsmount *mnt)
+{
+ return mnt_to_propagation_flags(real_mount(mnt));
+}
+EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags);
+
static void statmount_sb_basic(struct kstatmount *s)
{
struct super_block *sb = s->mnt->mnt_sb;
@@ -5454,11 +5429,11 @@ static int statmount_string(struct kstatmount *s, u64 flag)
ret = statmount_sb_source(s, seq);
break;
case STATMOUNT_MNT_UIDMAP:
- sm->mnt_uidmap = start;
+ offp = &sm->mnt_uidmap;
ret = statmount_mnt_uidmap(s, seq);
break;
case STATMOUNT_MNT_GIDMAP:
- sm->mnt_gidmap = start;
+ offp = &sm->mnt_gidmap;
ret = statmount_mnt_gidmap(s, seq);
break;
default:
@@ -5736,7 +5711,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
if (ret)
return ret;
- if (kreq->spare != 0)
+ if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
return -EINVAL;
/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5753,16 +5728,14 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
{
struct mnt_namespace *mnt_ns;
- if (kreq->mnt_ns_id && kreq->spare)
- return ERR_PTR(-EINVAL);
-
- if (kreq->mnt_ns_id)
- return lookup_mnt_ns(kreq->mnt_ns_id);
-
- if (kreq->spare) {
+ if (kreq->mnt_ns_id) {
+ mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id);
+ if (!mnt_ns)
+ return ERR_PTR(-ENOENT);
+ } else if (kreq->mnt_ns_fd) {
struct ns_common *ns;
- CLASS(fd, f)(kreq->spare);
+ CLASS(fd, f)(kreq->mnt_ns_fd);
if (fd_empty(f))
return ERR_PTR(-EBADF);
@@ -5774,11 +5747,12 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq
return ERR_PTR(-EINVAL);
mnt_ns = to_mnt_ns(ns);
+ refcount_inc(&mnt_ns->passive);
} else {
mnt_ns = current->nsproxy->mnt_ns;
+ refcount_inc(&mnt_ns->passive);
}
- refcount_inc(&mnt_ns->passive);
return mnt_ns;
}
@@ -5801,8 +5775,8 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
return ret;
ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
@@ -5912,8 +5886,8 @@ static void __free_klistmount_free(const struct klistmount *kls)
static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
size_t nr_mnt_ids)
{
-
u64 last_mnt_id = kreq->param;
+ struct mnt_namespace *ns;
/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
@@ -5927,9 +5901,10 @@ static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *
if (!kls->kmnt_ids)
return -ENOMEM;
- kls->ns = grab_requested_mnt_ns(kreq);
- if (!kls->ns)
- return -ENOENT;
+ ns = grab_requested_mnt_ns(kreq);
+ if (IS_ERR(ns))
+ return PTR_ERR(ns);
+ kls->ns = ns;
kls->mnt_parent_id = kreq->mnt_id;
return 0;
@@ -5985,11 +5960,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
}
struct mnt_namespace init_mnt_ns = {
- .ns.inum = ns_init_inum(&init_mnt_ns),
- .ns.ops = &mntns_operations,
+ .ns = NS_COMMON_INIT(init_mnt_ns),
.user_ns = &init_user_ns,
- .ns.__ns_ref = REFCOUNT_INIT(1),
- .ns.ns_type = ns_common_type(&init_mnt_ns),
.passive = REFCOUNT_INIT(1),
.mounts = RB_ROOT,
.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 09394ac2c180..f9d62abef2ac 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -535,7 +535,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
folio_unlock(folio);
err = filemap_fdatawrite_range(mapping,
folio_pos(folio),
- folio_pos(folio) + folio_size(folio));
+ folio_next_pos(folio));
switch (err) {
case 0:
ret = VM_FAULT_RETRY;
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 486166460e17..6df89c92b10b 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -147,10 +147,10 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
if (!fscache_cookie_valid(cookie))
return true;
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ if (!(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) {
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
- inode->i_state |= I_PINNING_NETFS_WB;
+ if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ inode_state_set(inode, I_PINNING_NETFS_WB);
need_use = true;
}
spin_unlock(&inode->i_lock);
@@ -192,7 +192,7 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
{
struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
- if (inode->i_state & I_PINNING_NETFS_WB) {
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB) {
loff_t i_size = i_size_read(inode);
fscache_unuse_cookie(cookie, aux, &i_size);
}
@@ -298,7 +298,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_dirty(folio))
return false;
- end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode));
+ end = umin(folio_next_pos(folio), i_size_read(&ctx->inode));
if (end > ctx->zero_point)
ctx->zero_point = end;
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index 5c0dc4efc792..8e6264f62a8f 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -36,12 +36,12 @@ void netfs_single_mark_inode_dirty(struct inode *inode)
mark_inode_dirty(inode);
- if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) {
+ if (caching && !(inode_state_read_once(inode) & I_PINNING_NETFS_WB)) {
bool need_use = false;
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_NETFS_WB)) {
- inode->i_state |= I_PINNING_NETFS_WB;
+ if (!(inode_state_read(inode) & I_PINNING_NETFS_WB)) {
+ inode_state_set(inode, I_PINNING_NETFS_WB);
need_use = true;
}
spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4e3dcc157a83..54699299d5b1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -338,6 +338,14 @@ again:
/* Match the xprt security policy */
if (clp->cl_xprtsec.policy != data->xprtsec.policy)
continue;
+ if (clp->cl_xprtsec.policy == RPC_XPRTSEC_TLS_X509) {
+ if (clp->cl_xprtsec.cert_serial !=
+ data->xprtsec.cert_serial)
+ continue;
+ if (clp->cl_xprtsec.privkey_serial !=
+ data->xprtsec.privkey_serial)
+ continue;
+ }
refcount_inc(&clp->cl_count);
return clp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 46d9c65d50f8..ea9f6ca8f30f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2268,11 +2268,12 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
return -ENAMETOOLONG;
if (open_flags & O_CREAT) {
- file->f_mode |= FMODE_CREATED;
error = nfs_do_create(dir, dentry, mode, open_flags);
- if (error)
+ if (!error) {
+ file->f_mode |= FMODE_CREATED;
+ return finish_open(file, dentry, NULL);
+ } else if (error != -EEXIST || open_flags & O_EXCL)
return error;
- return finish_open(file, dentry, NULL);
}
if (d_in_lookup(dentry)) {
/* The only flags nfs_lookup considers are
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 18b57c7c2f97..f76fe406937a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -475,7 +475,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
goto out_no_inode;
}
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long now = jiffies;
@@ -718,6 +718,8 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct nfs_fattr *fattr;
loff_t oldsize = i_size_read(inode);
int error = 0;
+ kuid_t task_uid = current_fsuid();
+ kuid_t owner_uid = inode->i_uid;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -739,9 +741,11 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) {
spin_lock(&inode->i_lock);
if (attr->ia_valid & ATTR_MTIME_SET) {
- nfs_set_timestamps_to_ts(inode, attr);
- attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
+ if (uid_eq(task_uid, owner_uid)) {
+ nfs_set_timestamps_to_ts(inode, attr);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
ATTR_ATIME|ATTR_ATIME_SET);
+ }
} else {
nfs_update_timestamps(inode, attr->ia_valid);
attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME);
@@ -751,10 +755,12 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
attr->ia_valid & ATTR_ATIME &&
!(attr->ia_valid & ATTR_MTIME)) {
if (attr->ia_valid & ATTR_ATIME_SET) {
- spin_lock(&inode->i_lock);
- nfs_set_timestamps_to_ts(inode, attr);
- spin_unlock(&inode->i_lock);
- attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ if (uid_eq(task_uid, owner_uid)) {
+ spin_lock(&inode->i_lock);
+ nfs_set_timestamps_to_ts(inode, attr);
+ spin_unlock(&inode->i_lock);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ }
} else {
nfs_update_delegated_atime(inode);
attr->ia_valid &= ~ATTR_ATIME;
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 2c0455e91571..49ed90c6b9f2 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -42,10 +42,9 @@ struct nfs_local_kiocb {
/* Begin mostly DIO-specific members */
size_t end_len;
short int end_iter_index;
- short int n_iters;
+ atomic_t n_iters;
bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS];
- loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
- struct iov_iter iters[NFSLOCAL_MAX_IOS];
+ struct iov_iter iters[NFSLOCAL_MAX_IOS] ____cacheline_aligned;
/* End mostly DIO-specific members */
};
@@ -314,7 +313,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr,
init_sync_kiocb(&iocb->kiocb, file);
iocb->hdr = hdr;
+ iocb->kiocb.ki_pos = hdr->args.offset;
iocb->kiocb.ki_flags &= ~IOCB_APPEND;
+ iocb->kiocb.ki_complete = NULL;
iocb->aio_complete_work = NULL;
iocb->end_iter_index = -1;
@@ -388,13 +389,24 @@ static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i,
return true;
}
+static void
+nfs_local_iter_setup(struct iov_iter *iter, int rw, struct bio_vec *bvec,
+ unsigned int nvecs, unsigned long total,
+ size_t start, size_t len)
+{
+ iov_iter_bvec(iter, rw, bvec, nvecs, total);
+ if (start)
+ iov_iter_advance(iter, start);
+ iov_iter_truncate(iter, len);
+}
+
/*
* Setup as many as 3 iov_iter based on extents described by @local_dio.
* Returns the number of iov_iter that were setup.
*/
static int
nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
- unsigned int nvecs, size_t len,
+ unsigned int nvecs, unsigned long total,
struct nfs_local_dio *local_dio)
{
int n_iters = 0;
@@ -402,39 +414,17 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
/* Setup misaligned start? */
if (local_dio->start_len) {
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- iters[n_iters].count = local_dio->start_len;
- iocb->offset[n_iters] = iocb->hdr->args.offset;
- iocb->iter_is_dio_aligned[n_iters] = false;
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
+ nvecs, total, 0, local_dio->start_len);
++n_iters;
}
- /* Setup misaligned end?
- * If so, the end is purposely setup to be issued using buffered IO
- * before the middle (which will use DIO, if DIO-aligned, with AIO).
- * This creates problems if/when the end results in a partial write.
- * So must save index and length of end to handle this corner case.
- */
- if (local_dio->end_len) {
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- iocb->offset[n_iters] = local_dio->end_offset;
- iov_iter_advance(&iters[n_iters],
- local_dio->start_len + local_dio->middle_len);
- iocb->iter_is_dio_aligned[n_iters] = false;
- /* Save index and length of end */
- iocb->end_iter_index = n_iters;
- iocb->end_len = local_dio->end_len;
- ++n_iters;
- }
-
- /* Setup DIO-aligned middle to be issued last, to allow for
- * DIO with AIO completion (see nfs_local_call_{read,write}).
+ /*
+ * Setup DIO-aligned middle, if there is no misaligned end (below)
+ * then AIO completion is used, see nfs_local_call_{read,write}
*/
- iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len);
- if (local_dio->start_len)
- iov_iter_advance(&iters[n_iters], local_dio->start_len);
- iters[n_iters].count -= local_dio->end_len;
- iocb->offset[n_iters] = local_dio->middle_offset;
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec, nvecs,
+ total, local_dio->start_len, local_dio->middle_len);
iocb->iter_is_dio_aligned[n_iters] =
nfs_iov_iter_aligned_bvec(&iters[n_iters],
@@ -442,12 +432,22 @@ nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw,
if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) {
trace_nfs_local_dio_misaligned(iocb->hdr->inode,
- iocb->hdr->args.offset, len, local_dio);
+ local_dio->start_len, local_dio->middle_len, local_dio);
return 0; /* no DIO-aligned IO possible */
}
+ iocb->end_iter_index = n_iters;
++n_iters;
- iocb->n_iters = n_iters;
+ /* Setup misaligned end? */
+ if (local_dio->end_len) {
+ nfs_local_iter_setup(&iters[n_iters], rw, iocb->bvec,
+ nvecs, total, local_dio->start_len +
+ local_dio->middle_len, local_dio->end_len);
+ iocb->end_iter_index = n_iters;
+ ++n_iters;
+ }
+
+ atomic_set(&iocb->n_iters, n_iters);
return n_iters;
}
@@ -473,18 +473,26 @@ nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw)
}
len = hdr->args.count - total;
+ /*
+ * For each iocb, iocb->n_iters is always at least 1 and we always
+ * end io after first nfs_local_pgio_done call unless misaligned DIO.
+ */
+ atomic_set(&iocb->n_iters, 1);
+
if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) {
struct nfs_local_dio local_dio;
if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) &&
- nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0)
+ nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) {
+ /* Ensure DIO WRITE's IO on stable storage upon completion */
+ if (rw == ITER_SOURCE)
+ iocb->kiocb.ki_flags |= IOCB_DSYNC|IOCB_SYNC;
return; /* is DIO-aligned */
+ }
}
/* Use buffered IO */
- iocb->offset[0] = hdr->args.offset;
iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len);
- iocb->n_iters = 1;
}
static void
@@ -504,9 +512,11 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr,
hdr->task.tk_start = ktime_get();
}
-static void
-nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
+static bool
+nfs_local_pgio_done(struct nfs_local_kiocb *iocb, long status, bool force)
{
+ struct nfs_pgio_header *hdr = iocb->hdr;
+
/* Must handle partial completions */
if (status >= 0) {
hdr->res.count += status;
@@ -517,6 +527,12 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status)
hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status);
hdr->task.tk_status = status;
}
+
+ if (force)
+ return true;
+
+ BUG_ON(atomic_read(&iocb->n_iters) <= 0);
+ return atomic_dec_and_test(&iocb->n_iters);
}
static void
@@ -547,11 +563,11 @@ static inline void nfs_local_pgio_aio_complete(struct nfs_local_kiocb *iocb)
queue_work(nfsiod_workqueue, &iocb->work);
}
-static void
-nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
+static void nfs_local_read_done(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
struct file *filp = iocb->kiocb.ki_filp;
+ long status = hdr->task.tk_status;
if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) {
/* Underlying FS will return -EINVAL if misaligned DIO is attempted. */
@@ -564,20 +580,27 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status)
*/
hdr->res.replen = 0;
- if (hdr->res.count != hdr->args.count ||
- hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
+ /* nfs_readpage_result() handles short read */
+
+ if (hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp)))
hdr->res.eof = true;
dprintk("%s: read %ld bytes eof %d.\n", __func__,
status > 0 ? status : 0, hdr->res.eof);
}
+static inline void nfs_local_read_iocb_done(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_read_done(iocb);
+ nfs_local_pgio_release(iocb);
+}
+
static void nfs_local_read_aio_complete_work(struct work_struct *work)
{
struct nfs_local_kiocb *iocb =
container_of(work, struct nfs_local_kiocb, work);
- nfs_local_pgio_release(iocb);
+ nfs_local_read_iocb_done(iocb);
}
static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
@@ -585,43 +608,51 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
- nfs_local_pgio_done(iocb->hdr, ret);
- nfs_local_read_done(iocb, ret);
+ /* AIO completion of DIO read should always be last to complete */
+ if (unlikely(!nfs_local_pgio_done(iocb, ret, false)))
+ return;
+
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */
}
-static void nfs_local_call_read(struct work_struct *work)
+static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *filp)
{
- struct nfs_local_kiocb *iocb =
- container_of(work, struct nfs_local_kiocb, work);
- struct file *filp = iocb->kiocb.ki_filp;
- const struct cred *save_cred;
+ bool force_done = false;
ssize_t status;
+ int n_iters;
- save_cred = override_creds(filp->f_cred);
-
- for (int i = 0; i < iocb->n_iters ; i++) {
+ n_iters = atomic_read(&iocb->n_iters);
+ for (int i = 0; i < n_iters ; i++) {
if (iocb->iter_is_dio_aligned[i]) {
iocb->kiocb.ki_flags |= IOCB_DIRECT;
- iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
- iocb->aio_complete_work = nfs_local_read_aio_complete_work;
- }
+ /* Only use AIO completion if DIO-aligned segment is last */
+ if (i == iocb->end_iter_index) {
+ iocb->kiocb.ki_complete = nfs_local_read_aio_complete;
+ iocb->aio_complete_work = nfs_local_read_aio_complete_work;
+ }
+ } else
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
- iocb->kiocb.ki_pos = iocb->offset[i];
status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]);
if (status != -EIOCBQUEUED) {
- nfs_local_pgio_done(iocb->hdr, status);
- if (iocb->hdr->task.tk_status)
+ if (unlikely(status >= 0 && status < iocb->iters[i].count))
+ force_done = true; /* Partial read */
+ if (nfs_local_pgio_done(iocb, status, force_done)) {
+ nfs_local_read_iocb_done(iocb);
break;
+ }
}
}
+}
- revert_creds(save_cred);
+static void nfs_local_call_read(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
- if (status != -EIOCBQUEUED) {
- nfs_local_read_done(iocb, status);
- nfs_local_pgio_release(iocb);
- }
+ scoped_with_creds(filp->f_cred)
+ do_nfs_local_call_read(iocb, filp);
}
static int
@@ -736,11 +767,10 @@ static void nfs_local_vfs_getattr(struct nfs_local_kiocb *iocb)
fattr->du.nfs3.used = stat.blocks << 9;
}
-static void
-nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
+static void nfs_local_write_done(struct nfs_local_kiocb *iocb)
{
struct nfs_pgio_header *hdr = iocb->hdr;
- struct inode *inode = hdr->inode;
+ long status = hdr->task.tk_status;
dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0);
@@ -759,10 +789,17 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status)
nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset);
status = -ENOSPC;
/* record -ENOSPC in terms of nfs_local_pgio_done */
- nfs_local_pgio_done(hdr, status);
+ (void) nfs_local_pgio_done(iocb, status, true);
}
if (hdr->task.tk_status < 0)
- nfs_reset_boot_verifier(inode);
+ nfs_reset_boot_verifier(hdr->inode);
+}
+
+static inline void nfs_local_write_iocb_done(struct nfs_local_kiocb *iocb)
+{
+ nfs_local_write_done(iocb);
+ nfs_local_vfs_getattr(iocb);
+ nfs_local_pgio_release(iocb);
}
static void nfs_local_write_aio_complete_work(struct work_struct *work)
@@ -770,8 +807,7 @@ static void nfs_local_write_aio_complete_work(struct work_struct *work)
struct nfs_local_kiocb *iocb =
container_of(work, struct nfs_local_kiocb, work);
- nfs_local_vfs_getattr(iocb);
- nfs_local_pgio_release(iocb);
+ nfs_local_write_iocb_done(iocb);
}
static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
@@ -779,75 +815,62 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret)
struct nfs_local_kiocb *iocb =
container_of(kiocb, struct nfs_local_kiocb, kiocb);
- nfs_local_pgio_done(iocb->hdr, ret);
- nfs_local_write_done(iocb, ret);
+ /* AIO completion of DIO write should always be last to complete */
+ if (unlikely(!nfs_local_pgio_done(iocb, ret, false)))
+ return;
+
nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */
}
-static void nfs_local_call_write(struct work_struct *work)
+static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb,
+ struct file *filp)
{
- struct nfs_local_kiocb *iocb =
- container_of(work, struct nfs_local_kiocb, work);
- struct file *filp = iocb->kiocb.ki_filp;
- unsigned long old_flags = current->flags;
- const struct cred *save_cred;
+ bool force_done = false;
ssize_t status;
-
- current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
- save_cred = override_creds(filp->f_cred);
+ int n_iters;
file_start_write(filp);
- for (int i = 0; i < iocb->n_iters ; i++) {
+ n_iters = atomic_read(&iocb->n_iters);
+ for (int i = 0; i < n_iters ; i++) {
if (iocb->iter_is_dio_aligned[i]) {
iocb->kiocb.ki_flags |= IOCB_DIRECT;
- iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
- iocb->aio_complete_work = nfs_local_write_aio_complete_work;
- }
-retry:
- iocb->kiocb.ki_pos = iocb->offset[i];
+ /* Only use AIO completion if DIO-aligned segment is last */
+ if (i == iocb->end_iter_index) {
+ iocb->kiocb.ki_complete = nfs_local_write_aio_complete;
+ iocb->aio_complete_work = nfs_local_write_aio_complete_work;
+ }
+ } else
+ iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
+
status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]);
if (status != -EIOCBQUEUED) {
- if (unlikely(status >= 0 && status < iocb->iters[i].count)) {
- /* partial write */
- if (i == iocb->end_iter_index) {
- /* Must not account partial end, otherwise, due
- * to end being issued before middle: the partial
- * write accounting in nfs_local_write_done()
- * would incorrectly advance hdr->args.offset
- */
- status = 0;
- } else {
- /* Partial write at start or buffered middle,
- * exit early.
- */
- nfs_local_pgio_done(iocb->hdr, status);
- break;
- }
- } else if (unlikely(status == -ENOTBLK &&
- (iocb->kiocb.ki_flags & IOCB_DIRECT))) {
- /* VFS will return -ENOTBLK if DIO WRITE fails to
- * invalidate the page cache. Retry using buffered IO.
- */
- iocb->kiocb.ki_flags &= ~IOCB_DIRECT;
- iocb->kiocb.ki_complete = NULL;
- iocb->aio_complete_work = NULL;
- goto retry;
- }
- nfs_local_pgio_done(iocb->hdr, status);
- if (iocb->hdr->task.tk_status)
+ if (unlikely(status >= 0 && status < iocb->iters[i].count))
+ force_done = true; /* Partial write */
+ if (nfs_local_pgio_done(iocb, status, force_done)) {
+ nfs_local_write_iocb_done(iocb);
break;
+ }
}
}
file_end_write(filp);
- revert_creds(save_cred);
- current->flags = old_flags;
+ return status;
+}
- if (status != -EIOCBQUEUED) {
- nfs_local_write_done(iocb, status);
- nfs_local_vfs_getattr(iocb);
- nfs_local_pgio_release(iocb);
- }
+static void nfs_local_call_write(struct work_struct *work)
+{
+ struct nfs_local_kiocb *iocb =
+ container_of(work, struct nfs_local_kiocb, work);
+ struct file *filp = iocb->kiocb.ki_filp;
+ unsigned long old_flags = current->flags;
+ ssize_t status;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+
+ scoped_with_creds(filp->f_cred)
+ status = do_nfs_local_call_write(iocb, filp);
+
+ current->flags = old_flags;
}
static int
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 0d7310c1ee0c..5d97c1d38bb6 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -2,6 +2,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/sunrpc/addr.h>
+#include <net/handshake.h>
#include "internal.h"
#include "nfs3_fs.h"
#include "netns.h"
@@ -98,7 +99,11 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
.cred = mds_srv->cred,
- .xprtsec = mds_clp->cl_xprtsec,
+ .xprtsec = {
+ .policy = RPC_XPRTSEC_NONE,
+ .cert_serial = TLS_NO_CERT,
+ .privkey_serial = TLS_NO_PRIVKEY,
+ },
.connect_timeout = connect_timeout,
.reconnect_timeout = connect_timeout,
};
@@ -111,9 +116,14 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_clp->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
+ cl_init.xprtsec = mds_clp->cl_xprtsec;
+ else
+ ds_proto = XPRT_TRANSPORT_TCP;
+ fallthrough;
case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
- case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1)
cl_init.nconnect = mds_clp->cl_nconnect;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 5998d6bd8a4f..3a4baed993c9 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -11,6 +11,7 @@
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/bc_xprt.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <net/handshake.h>
#include "internal.h"
#include "callback.h"
#include "delegation.h"
@@ -983,7 +984,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
.net = mds_clp->cl_net,
.timeparms = &ds_timeout,
.cred = mds_srv->cred,
- .xprtsec = mds_srv->nfs_client->cl_xprtsec,
+ .xprtsec = {
+ .policy = RPC_XPRTSEC_NONE,
+ .cert_serial = TLS_NO_CERT,
+ .privkey_serial = TLS_NO_PRIVKEY,
+ },
};
char buf[INET6_ADDRSTRLEN + 1];
@@ -992,9 +997,14 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
cl_init.hostname = buf;
switch (ds_proto) {
+ case XPRT_TRANSPORT_TCP_TLS:
+ if (mds_srv->nfs_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
+ cl_init.xprtsec = mds_srv->nfs_client->cl_xprtsec;
+ else
+ ds_proto = XPRT_TRANSPORT_TCP;
+ fallthrough;
case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
- case XPRT_TRANSPORT_TCP_TLS:
if (mds_clp->cl_nconnect > 1) {
cl_init.nconnect = mds_clp->cl_nconnect;
cl_init.max_connect = NFS_MAX_TRANSPORTS;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 7f43e890d356..7317f26892c5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -431,6 +431,8 @@ void nfs42_ssc_unregister_ops(void)
static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
void **priv)
{
+ if (!S_ISREG(file_inode(file)->i_mode))
+ return -EINVAL;
return nfs4_proc_setlease(file, arg, lease, priv);
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 00932500fce4..9e1c48c5c0b8 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -306,15 +306,12 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
const char *type, void *data,
size_t data_size, struct idmap *idmap)
{
- const struct cred *saved_cred;
struct key *rkey;
const struct user_key_payload *payload;
ssize_t ret;
- saved_cred = override_creds(id_resolver_cache);
- rkey = nfs_idmap_request_key(name, namelen, type, idmap);
- revert_creds(saved_cred);
-
+ scoped_with_creds(id_resolver_cache)
+ rkey = nfs_idmap_request_key(name, namelen, type, idmap);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
goto out;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 411776718494..93c6ce04332b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4715,16 +4715,19 @@ static int _nfs4_proc_lookupp(struct inode *inode,
};
unsigned short task_flags = 0;
- if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+ if (server->flags & NFS_MOUNT_SOFTREVAL)
task_flags |= RPC_TASK_TIMEOUT;
+ if (server->caps & NFS_CAP_MOVEABLE)
+ task_flags |= RPC_TASK_MOVEABLE;
args.bitmask = nfs4_bitmask(server, fattr->label);
nfs_fattr_init(fattr);
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0);
dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino);
- status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
- &res.seq_res, task_flags);
+ status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args,
+ &res.seq_res, task_flags);
dprintk("NFS reply lookupp: %d\n", status);
return status;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3135b5af7ee..f157d43d1312 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -317,7 +317,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
pnfs_detach_layout_hdr(lo);
/* Notify pnfs_destroy_layout_final() that we're done */
- if (inode->i_state & (I_FREEING | I_CLEAR))
+ if (inode_state_read(inode) & (I_FREEING | I_CLEAR))
wake_up_var_locked(lo, &inode->i_lock);
spin_unlock(&inode->i_lock);
pnfs_free_layout_hdr(lo);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7b32afb29782..9976cc16b689 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -809,8 +809,11 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
unsigned int retrans)
{
struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy;
struct nfs4_pnfs_ds_addr *da;
unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10;
+ int ds_proto;
int status = 0;
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
@@ -834,27 +837,28 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
.xprtsec = clp->cl_xprtsec,
};
- if (da->da_transport != clp->cl_proto &&
- clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
- continue;
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ if (xprt_args.ident == XPRT_TRANSPORT_TCP &&
+ clp->cl_proto == XPRT_TRANSPORT_TCP_TLS)
xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
- if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ if (xprt_args.ident != clp->cl_proto)
+ continue;
+ if (xprt_args.dstaddr->sa_family !=
+ clp->cl_addr.ss_family)
continue;
/* Add this address as an alias */
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
- rpc_clnt_test_and_add_xprt, NULL);
+ rpc_clnt_test_and_add_xprt, NULL);
continue;
}
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
- da->da_transport = XPRT_TRANSPORT_TCP_TLS;
- clp = get_v3_ds_connect(mds_srv,
- &da->da_addr,
- da->da_addrlen, da->da_transport,
- timeo, retrans);
+
+ ds_proto = da->da_transport;
+ if (ds_proto == XPRT_TRANSPORT_TCP &&
+ xprtsec_policy != RPC_XPRTSEC_NONE)
+ ds_proto = XPRT_TRANSPORT_TCP_TLS;
+
+ clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen,
+ ds_proto, timeo, retrans);
if (IS_ERR(clp))
continue;
clp->cl_rpcclient->cl_softerr = 0;
@@ -880,7 +884,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
u32 minor_version)
{
struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
+ enum xprtsec_policies xprtsec_policy = mds_clp->cl_xprtsec.policy;
struct nfs4_pnfs_ds_addr *da;
+ int ds_proto;
int status = 0;
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
@@ -908,12 +915,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
.data = &xprtdata,
};
- if (da->da_transport != clp->cl_proto &&
- clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
- continue;
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto ==
- XPRT_TRANSPORT_TCP_TLS) {
+ if (xprt_args.ident == XPRT_TRANSPORT_TCP &&
+ clp->cl_proto == XPRT_TRANSPORT_TCP_TLS) {
struct sockaddr *addr =
(struct sockaddr *)&da->da_addr;
struct sockaddr_in *sin =
@@ -944,7 +947,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
xprt_args.servername = servername;
}
- if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+ if (xprt_args.ident != clp->cl_proto)
+ continue;
+ if (xprt_args.dstaddr->sa_family !=
+ clp->cl_addr.ss_family)
continue;
/**
@@ -958,15 +964,14 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
if (xprtdata.cred)
put_cred(xprtdata.cred);
} else {
- if (da->da_transport == XPRT_TRANSPORT_TCP &&
- mds_srv->nfs_client->cl_proto ==
- XPRT_TRANSPORT_TCP_TLS)
- da->da_transport = XPRT_TRANSPORT_TCP_TLS;
- clp = nfs4_set_ds_client(mds_srv,
- &da->da_addr,
- da->da_addrlen,
- da->da_transport, timeo,
- retrans, minor_version);
+ ds_proto = da->da_transport;
+ if (ds_proto == XPRT_TRANSPORT_TCP &&
+ xprtsec_policy != RPC_XPRTSEC_NONE)
+ ds_proto = XPRT_TRANSPORT_TCP_TLS;
+
+ clp = nfs4_set_ds_client(mds_srv, &da->da_addr,
+ da->da_addrlen, ds_proto,
+ timeo, retrans, minor_version);
if (IS_ERR(clp))
continue;
@@ -977,7 +982,6 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
clp = ERR_PTR(-EIO);
continue;
}
-
}
}
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 545148d42dcc..ea6e6168092b 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -189,6 +189,7 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
return p;
kobject_put(&p->kobject);
+ kobject_put(&p->nfs_net_kobj);
}
return NULL;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index a238b6725008..93798575b807 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1086,7 +1086,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
struct auth_domain *client,
struct svc_fh *fhp,
unsigned int may_flags, struct file *file,
- struct nfsd_file **pnf, bool want_gc)
+ umode_t type, bool want_gc, struct nfsd_file **pnf)
{
unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
struct nfsd_file *new, *nf;
@@ -1097,13 +1097,13 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
int ret;
retry:
- if (rqstp) {
- status = fh_verify(rqstp, fhp, S_IFREG,
+ if (rqstp)
+ status = fh_verify(rqstp, fhp, type,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
- } else {
- status = fh_verify_local(net, cred, client, fhp, S_IFREG,
+ else
+ status = fh_verify_local(net, cred, client, fhp, type,
may_flags|NFSD_MAY_OWNER_OVERRIDE);
- }
+
if (status != nfs_ok)
return status;
inode = d_inode(fhp->fh_dentry);
@@ -1176,15 +1176,18 @@ out:
open_file:
trace_nfsd_file_alloc(nf);
- nf->nf_mark = nfsd_file_mark_find_or_create(inode);
- if (nf->nf_mark) {
+
+ if (type == S_IFREG)
+ nf->nf_mark = nfsd_file_mark_find_or_create(inode);
+
+ if (type != S_IFREG || nf->nf_mark) {
if (file) {
get_file(file);
nf->nf_file = file;
status = nfs_ok;
trace_nfsd_file_opened(nf, status);
} else {
- ret = nfsd_open_verified(fhp, may_flags, &nf->nf_file);
+ ret = nfsd_open_verified(fhp, type, may_flags, &nf->nf_file);
if (ret == -EOPENSTALE && stale_retry) {
stale_retry = false;
nfsd_file_unhash(nf);
@@ -1246,7 +1249,7 @@ nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
- fhp, may_flags, NULL, pnf, true);
+ fhp, may_flags, NULL, S_IFREG, true, pnf);
}
/**
@@ -1271,7 +1274,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
- fhp, may_flags, NULL, pnf, false);
+ fhp, may_flags, NULL, S_IFREG, false, pnf);
}
/**
@@ -1314,8 +1317,8 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
const struct cred *save_cred = get_current_cred();
__be32 beres;
- beres = nfsd_file_do_acquire(NULL, net, cred, client,
- fhp, may_flags, NULL, pnf, false);
+ beres = nfsd_file_do_acquire(NULL, net, cred, client, fhp, may_flags,
+ NULL, S_IFREG, false, pnf);
put_cred(revert_creds(save_cred));
return beres;
}
@@ -1344,7 +1347,33 @@ nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_file **pnf)
{
return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL,
- fhp, may_flags, file, pnf, false);
+ fhp, may_flags, file, S_IFREG, false, pnf);
+}
+
+/**
+ * nfsd_file_acquire_dir - Get a struct nfsd_file with an open directory
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the directory to be opened
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put(). This opens directories only, and only
+ * in O_RDONLY mode.
+ *
+ * Return values:
+ * %nfs_ok - @pnf points to an nfsd_file with its reference
+ * count boosted.
+ *
+ * On error, an nfsstat value in network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, SVC_NET(rqstp), NULL, NULL, fhp,
+ NFSD_MAY_READ|NFSD_MAY_64BIT_COOKIE,
+ NULL, S_IFDIR, false, pnf);
}
/*
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e3d6ca2b6030..b383dbc5b921 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -82,5 +82,7 @@ __be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
struct auth_domain *client, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf);
+__be32 nfsd_file_acquire_dir(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file **pnf);
int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b6d03e1ef5f7..42adc5461db0 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -281,14 +281,11 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- inode_lock_nested(inode, I_MUTEX_PARENT);
-
- child = lookup_one(&nop_mnt_idmap,
- &QSTR_LEN(argp->name, argp->len),
- parent);
+ child = start_creating(&nop_mnt_idmap, parent,
+ &QSTR_LEN(argp->name, argp->len));
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
- goto out;
+ goto out_write;
}
if (d_really_is_negative(child)) {
@@ -344,7 +341,7 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
status = fh_fill_pre_attrs(fhp);
if (status != nfs_ok)
goto out;
- host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true);
+ host_err = vfs_create(&nop_mnt_idmap, child, iap->ia_mode, NULL);
if (host_err < 0) {
status = nfserrno(host_err);
goto out;
@@ -367,9 +364,8 @@ set_attr:
status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
out:
- inode_unlock(inode);
- if (child && !IS_ERR(child))
- dput(child);
+ end_creating(child);
+out_write:
fh_drop_write(fhp);
return status;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7f7e6bb23a90..b74800917583 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -264,14 +264,11 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (is_create_with_attrs(open))
nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
- inode_lock_nested(inode, I_MUTEX_PARENT);
-
- child = lookup_one(&nop_mnt_idmap,
- &QSTR_LEN(open->op_fname, open->op_fnamelen),
- parent);
+ child = start_creating(&nop_mnt_idmap, parent,
+ &QSTR_LEN(open->op_fname, open->op_fnamelen));
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
- goto out;
+ goto out_write;
}
if (d_really_is_negative(child)) {
@@ -379,10 +376,9 @@ set_attr:
if (attrs.na_aclerr)
open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
out:
- inode_unlock(inode);
+ end_creating(child);
nfsd_attrs_free(&attrs);
- if (child && !IS_ERR(child))
- dput(child);
+out_write:
fh_drop_write(fhp);
return status;
}
@@ -2342,6 +2338,13 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
union nfsd4_op_u *u)
{
struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+ struct nfs4_delegation *dd;
+ struct nfsd_file *nf;
+ __be32 status;
+
+ status = nfsd_file_acquire_dir(rqstp, &cstate->current_fh, &nf);
+ if (status != nfs_ok)
+ return status;
/*
* RFC 8881, section 18.39.3 says:
@@ -2355,7 +2358,20 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
* return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this
* situation.
*/
- gdd->gddrnf_status = GDD4_UNAVAIL;
+ dd = nfsd_get_dir_deleg(cstate, gdd, nf);
+ nfsd_file_put(nf);
+ if (IS_ERR(dd)) {
+ int err = PTR_ERR(dd);
+
+ if (err != -EAGAIN)
+ return nfserrno(err);
+ gdd->gddrnf_status = GDD4_UNAVAIL;
+ return nfs_ok;
+ }
+
+ gdd->gddrnf_status = GDD4_OK;
+ memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid));
+ nfs4_put_stid(&dd->dl_stid);
return nfs_ok;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e2b9472e5c78..b39d4cbdfd35 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -195,13 +195,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
goto out_creds;
dir = nn->rec_file->f_path.dentry;
- /* lock the parent */
- inode_lock(d_inode(dir));
- dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir);
+ dentry = start_creating(&nop_mnt_idmap, dir, &QSTR(dname));
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
- goto out_unlock;
+ goto out;
}
if (d_really_is_positive(dentry))
/*
@@ -212,15 +210,13 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* In the 4.0 case, we should never get here; but we may
* as well be forgiving and just succeed silently.
*/
- goto out_put;
- dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ goto out_end;
+ dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, 0700, NULL);
if (IS_ERR(dentry))
status = PTR_ERR(dentry);
-out_put:
- if (!status)
- dput(dentry);
-out_unlock:
- inode_unlock(d_inode(dir));
+out_end:
+ end_creating(dentry);
+out:
if (status == 0) {
if (nn->in_grace)
__nfsd4_create_reclaim_record_grace(clp, dname,
@@ -328,20 +324,12 @@ nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn)
dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name);
dir = nn->rec_file->f_path.dentry;
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir);
- if (IS_ERR(dentry)) {
- status = PTR_ERR(dentry);
- goto out_unlock;
- }
- status = -ENOENT;
- if (d_really_is_negative(dentry))
- goto out;
- status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry);
-out:
- dput(dentry);
-out_unlock:
- inode_unlock(d_inode(dir));
+ dentry = start_removing(&nop_mnt_idmap, dir, &QSTR(name));
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ status = vfs_rmdir(&nop_mnt_idmap, d_inode(dir), dentry, NULL);
+ end_removing(dentry);
return status;
}
@@ -427,7 +415,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
if (nfs4_has_reclaimed_state(name, nn))
goto out_free;
- status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child);
+ status = vfs_rmdir(&nop_mnt_idmap, d_inode(parent), child, NULL);
if (status)
printk("failed to remove client recovery directory %pd\n",
child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c1b54322c412..6791fc239dbd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1542,7 +1542,8 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
- WARN_ON(!list_empty(&stid->sc_cp_list));
+ if (!list_empty(&stid->sc_cp_list))
+ nfs4_free_cpntf_statelist(stid->sc_client->net, stid);
kmem_cache_free(stateid_slab, stid);
}
@@ -3486,7 +3487,20 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
struct nfsd4_slot *slot = resp->cstate.slot;
unsigned int base;
- dprintk("--> %s slot %p\n", __func__, slot);
+ /*
+ * RFC 5661 Section 2.10.6.1.2:
+ *
+ * Any time SEQUENCE ... returns an error ... [t]he replier MUST NOT
+ * modify the reply cache entry for the slot whenever an error is
+ * returned from SEQUENCE ...
+ *
+ * Because nfsd4_store_cache_entry() is called only by
+ * nfsd4_sequence_done(), it runs only when a SEQUENCE
+ * operation was part of the COMPOUND.
+ * nfs41_check_op_ordering() ensures SEQUENCE is the first op.
+ */
+ if (resp->opcnt == 1 && resp->cstate.status != nfs_ok)
+ return;
slot->sl_flags |= NFSD4_SLOT_INITIALIZED;
slot->sl_opcnt = resp->opcnt;
@@ -4349,6 +4363,36 @@ static bool replay_matches_cache(struct svc_rqst *rqstp,
return true;
}
+/*
+ * Note that the response is constructed here both for the case
+ * of a new SEQUENCE request and for a replayed SEQUENCE request.
+ * We do not cache SEQUENCE responses as SEQUENCE is idempotent.
+ */
+static void nfsd4_construct_sequence_response(struct nfsd4_session *session,
+ struct nfsd4_sequence *seq)
+{
+ struct nfs4_client *clp = session->se_client;
+
+ seq->maxslots_response = max(session->se_target_maxslots,
+ seq->maxslots);
+ seq->target_maxslots = session->se_target_maxslots;
+
+ switch (clp->cl_cb_state) {
+ case NFSD4_CB_DOWN:
+ seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
+ break;
+ case NFSD4_CB_FAULT:
+ seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
+ break;
+ default:
+ seq->status_flags = 0;
+ }
+ if (!list_empty(&clp->cl_revoked))
+ seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+ if (atomic_read(&clp->cl_admin_revoked))
+ seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
+}
+
__be32
nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
@@ -4398,6 +4442,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("%s: slotid %d\n", __func__, seq->slotid);
trace_nfsd_slot_seqid_sequence(clp, seq, slot);
+
+ nfsd4_construct_sequence_response(session, seq);
+
status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_flags);
if (status == nfserr_replay_cache) {
status = nfserr_seq_misordered;
@@ -4495,23 +4542,6 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
out:
- seq->maxslots = max(session->se_target_maxslots, seq->maxslots);
- seq->target_maxslots = session->se_target_maxslots;
-
- switch (clp->cl_cb_state) {
- case NFSD4_CB_DOWN:
- seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
- break;
- case NFSD4_CB_FAULT:
- seq->status_flags = SEQ4_STATUS_BACKCHANNEL_FAULT;
- break;
- default:
- seq->status_flags = 0;
- }
- if (!list_empty(&clp->cl_revoked))
- seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
- if (atomic_read(&clp->cl_admin_revoked))
- seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
trace_nfsd_seq4_status(rqstp, seq);
out_no_session:
if (conn)
@@ -7829,7 +7859,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
+ status = fh_verify(rqstp, &cstate->current_fh, 0, 0);
+ if (status)
return status;
status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn);
@@ -9347,3 +9378,103 @@ out_status:
nfs4_put_stid(&dp->dl_stid);
return status;
}
+
+/**
+ * nfsd_get_dir_deleg - attempt to get a directory delegation
+ * @cstate: compound state
+ * @gdd: GET_DIR_DELEGATION arg/resp structure
+ * @nf: nfsd_file opened on the directory
+ *
+ * Given a GET_DIR_DELEGATION request @gdd, attempt to acquire a delegation
+ * on the directory to which @nf refers. Note that this does not set up any
+ * sort of async notifications for the delegation.
+ */
+struct nfs4_delegation *
+nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
+ struct nfsd4_get_dir_delegation *gdd,
+ struct nfsd_file *nf)
+{
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_delegation *dp;
+ struct file_lease *fl;
+ struct nfs4_file *fp, *rfp;
+ int status = 0;
+
+ fp = nfsd4_alloc_file();
+ if (!fp)
+ return ERR_PTR(-ENOMEM);
+
+ nfsd4_file_init(&cstate->current_fh, fp);
+
+ rfp = nfsd4_file_hash_insert(fp, &cstate->current_fh);
+ if (unlikely(!rfp)) {
+ put_nfs4_file(fp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (rfp != fp) {
+ put_nfs4_file(fp);
+ fp = rfp;
+ }
+
+ /* if this client already has one, return that it's unavailable */
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ /* existing delegation? */
+ if (nfs4_delegation_exists(clp, fp)) {
+ status = -EAGAIN;
+ } else if (!fp->fi_deleg_file) {
+ fp->fi_deleg_file = nfsd_file_get(nf);
+ fp->fi_delegees = 1;
+ } else {
+ ++fp->fi_delegees;
+ }
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status) {
+ put_nfs4_file(fp);
+ return ERR_PTR(status);
+ }
+
+ /* Try to set up the lease */
+ status = -ENOMEM;
+ dp = alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ);
+ if (!dp)
+ goto out_delegees;
+
+ fl = nfs4_alloc_init_lease(dp);
+ if (!fl)
+ goto out_put_stid;
+
+ status = kernel_setlease(nf->nf_file,
+ fl->c.flc_type, &fl, NULL);
+ if (fl)
+ locks_free_lease(fl);
+ if (status)
+ goto out_put_stid;
+
+ /*
+ * Now, try to hash it. This can fail if we race another nfsd task
+ * trying to set a delegation on the same file. If that happens,
+ * then just say UNAVAIL.
+ */
+ spin_lock(&state_lock);
+ spin_lock(&clp->cl_lock);
+ spin_lock(&fp->fi_lock);
+ status = hash_delegation_locked(dp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&clp->cl_lock);
+ spin_unlock(&state_lock);
+
+ if (!status)
+ return dp;
+
+ /* Something failed. Drop the lease and clean up the stid */
+ kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+out_put_stid:
+ nfs4_put_stid(&dp->dl_stid);
+out_delegees:
+ put_deleg_file(fp);
+ return ERR_PTR(status);
+}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6040a6145dad..67bb9c0b9fcb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5073,7 +5073,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr;
/* Note slotid's are numbered from zero: */
/* sr_highest_slotid */
- nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1);
+ nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots_response - 1);
if (nfserr != nfs_ok)
return nfserr;
/* sr_target_highest_slotid */
@@ -5925,8 +5925,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
*/
warn_on_nonidempotent_op(op);
xdr_truncate_encode(xdr, op_status_offset + XDR_UNIT);
- }
- if (so) {
+ } else if (so) {
int len = xdr->buf->len - (op_status_offset + XDR_UNIT);
so->so_replay.rp_status = op->status;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index f19320018639..b752433c3c2c 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -458,6 +458,7 @@ enum {
#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
FATTR4_WORD2_MODE_UMASK | \
+ FATTR4_WORD2_CLONE_BLKSIZE | \
NFSD4_2_SECURITY_ATTRS | \
FATTR4_WORD2_XATTR_SUPPORT | \
FATTR4_WORD2_TIME_DELEG_ACCESS | \
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3eb724ec9566..ed85dd43da18 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -269,9 +269,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
dentry);
}
- fhp->fh_dentry = dentry;
- fhp->fh_export = exp;
-
switch (fhp->fh_maxsize) {
case NFS4_FHSIZE:
if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR)
@@ -293,6 +290,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
goto out;
}
+ fhp->fh_dentry = dentry;
+ fhp->fh_export = exp;
+
return 0;
out:
exp_put(exp);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 8f71f5748c75..481e789a7697 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -306,18 +306,16 @@ nfsd_proc_create(struct svc_rqst *rqstp)
goto done;
}
- inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len),
- dirfhp->fh_dentry);
+ dchild = start_creating(&nop_mnt_idmap, dirfhp->fh_dentry,
+ &QSTR_LEN(argp->name, argp->len));
if (IS_ERR(dchild)) {
resp->status = nfserrno(PTR_ERR(dchild));
- goto out_unlock;
+ goto out_write;
}
fh_init(newfhp, NFS_FHSIZE);
resp->status = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
if (!resp->status && d_really_is_negative(dchild))
resp->status = nfserr_noent;
- dput(dchild);
if (resp->status) {
if (resp->status != nfserr_noent)
goto out_unlock;
@@ -409,6 +407,9 @@ nfsd_proc_create(struct svc_rqst *rqstp)
/* File doesn't exist. Create it and set attrs */
resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type,
rdev, newfhp);
+ /* nfsd_create_locked() unlocked the parent */
+ dput(dchild);
+ goto out_write;
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
@@ -423,7 +424,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
}
out_unlock:
- inode_unlock(dirfhp->fh_dentry->d_inode);
+ end_creating(dchild);
+out_write:
fh_drop_write(dirfhp);
done:
fh_put(dirfhp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 1e736f402426..b052c1effdc5 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -867,4 +867,9 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
struct dentry *dentry, struct nfs4_delegation **pdp);
+
+struct nfsd4_get_dir_delegation;
+struct nfs4_delegation *nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
+ struct nfsd4_get_dir_delegation *gdd,
+ struct nfsd_file *nf);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9cb20d4aeab1..31cbf46b47b1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -959,15 +959,16 @@ retry:
/**
* nfsd_open_verified - Open a regular file for the filecache
* @fhp: NFS filehandle of the file to open
+ * @type: S_IFMT inode type allowed (0 means any type is allowed)
* @may_flags: internal permission flags
* @filp: OUT: open "struct file *"
*
* Returns zero on success, or a negative errno value.
*/
int
-nfsd_open_verified(struct svc_fh *fhp, int may_flags, struct file **filp)
+nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp)
{
- return __nfsd_open(fhp, S_IFREG, may_flags, filp);
+ return __nfsd_open(fhp, type, may_flags, filp);
}
/*
@@ -1159,7 +1160,7 @@ static int wait_for_concurrent_writes(struct file *file)
dprintk("nfsd: write resume %d\n", task_pid_nr(current));
}
- if (inode->i_state & I_DIRTY) {
+ if (inode_state_read_once(inode) & I_DIRTY) {
dprintk("nfsd: write sync %d\n", task_pid_nr(current));
err = vfs_fsync(file, 0);
}
@@ -1521,7 +1522,7 @@ nfsd_check_ignore_resizing(struct iattr *iap)
iap->ia_valid &= ~ATTR_SIZE;
}
-/* The parent directory should already be locked: */
+/* The parent directory should already be locked; this function unlocks it. */
__be32
nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_attrs *attrs,
@@ -1552,13 +1553,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = 0;
switch (type) {
case S_IFREG:
- host_err = vfs_create(&nop_mnt_idmap, dirp, dchild,
- iap->ia_mode, true);
+ host_err = vfs_create(&nop_mnt_idmap, dchild, iap->ia_mode, NULL);
if (!host_err)
nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
- dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
+ dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode, NULL);
if (IS_ERR(dchild)) {
host_err = PTR_ERR(dchild);
} else if (d_is_negative(dchild)) {
@@ -1574,7 +1574,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
case S_IFIFO:
case S_IFSOCK:
host_err = vfs_mknod(&nop_mnt_idmap, dirp, dchild,
- iap->ia_mode, rdev);
+ iap->ia_mode, rdev, NULL);
break;
default:
printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
@@ -1587,8 +1587,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
out:
- if (!IS_ERR(dchild))
- dput(dchild);
+ if (!err)
+ fh_fill_post_attrs(fhp);
+ end_creating(dchild);
return err;
out_nfserr:
@@ -1626,28 +1627,24 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (host_err)
return nfserrno(host_err);
- inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+ dchild = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild)) {
- err = nfserrno(host_err);
- goto out_unlock;
- }
+ if (IS_ERR(dchild))
+ return nfserrno(host_err);
+
err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- /*
- * We unconditionally drop our ref to dchild as fh_compose will have
- * already grabbed its own ref for it.
- */
- dput(dchild);
if (err)
goto out_unlock;
err = fh_fill_pre_attrs(fhp);
if (err != nfs_ok)
goto out_unlock;
err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
- fh_fill_post_attrs(fhp);
+ /* nfsd_create_locked() unlocked the parent */
+ dput(dchild);
+ return err;
+
out_unlock:
- inode_unlock(dentry->d_inode);
+ end_creating(dchild);
return err;
}
@@ -1733,28 +1730,26 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
dentry = fhp->fh_dentry;
- inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+ dnew = start_creating(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
if (IS_ERR(dnew)) {
err = nfserrno(PTR_ERR(dnew));
- inode_unlock(dentry->d_inode);
goto out_drop_write;
}
err = fh_fill_pre_attrs(fhp);
if (err != nfs_ok)
goto out_unlock;
- host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path);
+ host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path, NULL);
err = nfserrno(host_err);
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
if (!err)
nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
fh_fill_post_attrs(fhp);
out_unlock:
- inode_unlock(dentry->d_inode);
+ end_creating(dnew);
if (!err)
err = nfserrno(commit_metadata(fhp));
- dput(dnew);
- if (err==0) err = cerr;
+ if (!err)
+ err = cerr;
out_drop_write:
fh_drop_write(fhp);
out:
@@ -1809,32 +1804,31 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
ddir = ffhp->fh_dentry;
dirp = d_inode(ddir);
- inode_lock_nested(dirp, I_MUTEX_PARENT);
+ dnew = start_creating(&nop_mnt_idmap, ddir, &QSTR_LEN(name, len));
- dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir);
if (IS_ERR(dnew)) {
host_err = PTR_ERR(dnew);
- goto out_unlock;
+ goto out_drop_write;
}
dold = tfhp->fh_dentry;
err = nfserr_noent;
if (d_really_is_negative(dold))
- goto out_dput;
+ goto out_unlock;
err = fh_fill_pre_attrs(ffhp);
if (err != nfs_ok)
- goto out_dput;
+ goto out_unlock;
host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL);
fh_fill_post_attrs(ffhp);
- inode_unlock(dirp);
+out_unlock:
+ end_creating(dnew);
if (!host_err) {
host_err = commit_metadata(ffhp);
if (!host_err)
host_err = commit_metadata(tfhp);
}
- dput(dnew);
out_drop_write:
fh_drop_write(tfhp);
if (host_err == -EBUSY) {
@@ -1849,12 +1843,6 @@ out_drop_write:
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
-
-out_dput:
- dput(dnew);
-out_unlock:
- inode_unlock(dirp);
- goto out_drop_write;
}
static void
@@ -1895,11 +1883,12 @@ __be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct svc_fh *tfhp, char *tname, int tlen)
{
- struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
+ struct dentry *fdentry, *tdentry;
int type = S_IFDIR;
+ struct renamedata rd = {};
__be32 err;
int host_err;
- bool close_cached = false;
+ struct dentry *close_cached;
trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen);
@@ -1925,15 +1914,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out;
retry:
+ close_cached = NULL;
host_err = fh_want_write(ffhp);
if (host_err) {
err = nfserrno(host_err);
goto out;
}
- trap = lock_rename(tdentry, fdentry);
- if (IS_ERR(trap)) {
- err = nfserr_xdev;
+ rd.mnt_idmap = &nop_mnt_idmap;
+ rd.old_parent = fdentry;
+ rd.new_parent = tdentry;
+
+ host_err = start_renaming(&rd, 0, &QSTR_LEN(fname, flen),
+ &QSTR_LEN(tname, tlen));
+
+ if (host_err) {
+ err = nfserrno(host_err);
goto out_want_write;
}
err = fh_fill_pre_attrs(ffhp);
@@ -1943,48 +1939,23 @@ retry:
if (err != nfs_ok)
goto out_unlock;
- odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry);
- host_err = PTR_ERR(odentry);
- if (IS_ERR(odentry))
- goto out_nfserr;
+ type = d_inode(rd.old_dentry)->i_mode & S_IFMT;
+
+ if (d_inode(rd.new_dentry))
+ type = d_inode(rd.new_dentry)->i_mode & S_IFMT;
- host_err = -ENOENT;
- if (d_really_is_negative(odentry))
- goto out_dput_old;
- host_err = -EINVAL;
- if (odentry == trap)
- goto out_dput_old;
- type = d_inode(odentry)->i_mode & S_IFMT;
-
- ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry);
- host_err = PTR_ERR(ndentry);
- if (IS_ERR(ndentry))
- goto out_dput_old;
- if (d_inode(ndentry))
- type = d_inode(ndentry)->i_mode & S_IFMT;
- host_err = -ENOTEMPTY;
- if (ndentry == trap)
- goto out_dput_new;
-
- if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
- nfsd_has_cached_files(ndentry)) {
- close_cached = true;
- goto out_dput_old;
+ if ((rd.new_dentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
+ nfsd_has_cached_files(rd.new_dentry)) {
+ close_cached = dget(rd.new_dentry);
+ goto out_unlock;
} else {
- struct renamedata rd = {
- .mnt_idmap = &nop_mnt_idmap,
- .old_parent = fdentry,
- .old_dentry = odentry,
- .new_parent = tdentry,
- .new_dentry = ndentry,
- };
int retries;
for (retries = 1;;) {
host_err = vfs_rename(&rd);
if (host_err != -EAGAIN || !retries--)
break;
- if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
+ if (!nfsd_wait_for_delegreturn(rqstp, d_inode(rd.old_dentry)))
break;
}
if (!host_err) {
@@ -1993,11 +1964,6 @@ retry:
host_err = commit_metadata(ffhp);
}
}
- out_dput_new:
- dput(ndentry);
- out_dput_old:
- dput(odentry);
- out_nfserr:
if (host_err == -EBUSY) {
/*
* See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME
@@ -2016,7 +1982,7 @@ retry:
fh_fill_post_attrs(tfhp);
}
out_unlock:
- unlock_rename(tdentry, fdentry);
+ end_renaming(&rd);
out_want_write:
fh_drop_write(ffhp);
@@ -2027,9 +1993,8 @@ out_want_write:
* until this point and then reattempt the whole shebang.
*/
if (close_cached) {
- close_cached = false;
- nfsd_close_cached_files(ndentry);
- dput(ndentry);
+ nfsd_close_cached_files(close_cached);
+ dput(close_cached);
goto retry;
}
out:
@@ -2054,7 +2019,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
{
struct dentry *dentry, *rdentry;
struct inode *dirp;
- struct inode *rinode;
+ struct inode *rinode = NULL;
__be32 err;
int host_err;
@@ -2073,24 +2038,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
- inode_lock_nested(dirp, I_MUTEX_PARENT);
- rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
+ rdentry = start_removing(&nop_mnt_idmap, dentry, &QSTR_LEN(fname, flen));
+
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
- goto out_unlock;
+ goto out_drop_write;
- if (d_really_is_negative(rdentry)) {
- dput(rdentry);
- host_err = -ENOENT;
- goto out_unlock;
- }
- rinode = d_inode(rdentry);
err = fh_fill_pre_attrs(fhp);
if (err != nfs_ok)
goto out_unlock;
+ rinode = d_inode(rdentry);
+ /* Prevent truncation until after locks dropped */
ihold(rinode);
+
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
@@ -2108,14 +2070,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
break;
}
} else {
- host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry);
+ host_err = vfs_rmdir(&nop_mnt_idmap, dirp, rdentry, NULL);
}
fh_fill_post_attrs(fhp);
- inode_unlock(dirp);
- if (!host_err)
+out_unlock:
+ end_removing(rdentry);
+ if (!err && !host_err)
host_err = commit_metadata(fhp);
- dput(rdentry);
iput(rinode); /* truncate the inode here */
out_drop_write:
@@ -2133,9 +2095,6 @@ out_nfserr:
}
out:
return err != nfs_ok ? err : nfserrno(host_err);
-out_unlock:
- inode_unlock(dirp);
- goto out_drop_write;
}
/*
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 0c0292611c6d..09de48c50cbe 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -114,7 +114,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
int nfsd_open_break_lease(struct inode *, int);
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-int nfsd_open_verified(struct svc_fh *fhp, int may_flags,
+int nfsd_open_verified(struct svc_fh *fhp, umode_t type, int may_flags,
struct file **filp);
__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index ee0570cbdd9e..1ce8e12ae335 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -574,8 +574,9 @@ struct nfsd4_sequence {
struct nfs4_sessionid sessionid; /* request/response */
u32 seqid; /* request/response */
u32 slotid; /* request/response */
- u32 maxslots; /* request/response */
+ u32 maxslots; /* request */
u32 cachethis; /* request */
+ u32 maxslots_response; /* response */
u32 target_maxslots; /* response */
u32 status_flags; /* response */
};
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index bcc7d76269ac..4bbdc832d7f2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -1148,7 +1148,7 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
if (unlikely(!cpfile))
return -ENOMEM;
- if (!(cpfile->i_state & I_NEW))
+ if (!(inode_state_read_once(cpfile) & I_NEW))
goto out;
err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index c664daba56ae..674380837ab9 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -506,7 +506,7 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
if (unlikely(!dat))
return -ENOMEM;
- if (!(dat->i_state & I_NEW))
+ if (!(inode_state_read_once(dat) & I_NEW))
goto out;
err = nilfs_mdt_init(dat, NILFS_MDT_GFP, sizeof(*di));
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index c4cd4a4dedd0..99eb8a59009e 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -188,7 +188,7 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO);
if (unlikely(!ifile))
return -ENOMEM;
- if (!(ifile->i_state & I_NEW))
+ if (!(inode_state_read_once(ifile) & I_NEW))
goto out;
err = nilfs_mdt_init(ifile, NILFS_MDT_GFP,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 87ddde159f0c..51bde45d5865 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -365,7 +365,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
failed_after_creation:
clear_nlink(inode);
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
iput(inode); /*
* raw_inode will be deleted through
@@ -562,7 +562,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (!inode->i_nlink) {
iput(inode);
return ERR_PTR(-ESTALE);
@@ -591,7 +591,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
err = nilfs_init_gcinode(inode);
@@ -631,7 +631,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode)
nilfs_iget_set, &args);
if (unlikely(!btnc_inode))
return -ENOMEM;
- if (btnc_inode->i_state & I_NEW) {
+ if (inode_state_read_once(btnc_inode) & I_NEW) {
nilfs_init_btnc_inode(btnc_inode);
unlock_new_inode(btnc_inode);
}
@@ -686,7 +686,7 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
nilfs_iget_set, &args);
if (unlikely(!s_inode))
return ERR_PTR(-ENOMEM);
- if (!(s_inode->i_state & I_NEW))
+ if (!(inode_state_read_once(s_inode) & I_NEW))
return inode;
NILFS_I(s_inode)->i_flags = 0;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f466daa39440..b7e3d91b6243 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -14,6 +14,7 @@
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
+#include <linux/fs_struct.h>
#include <linux/nilfs2_api.h>
#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index f15ca6fc400d..deee16bc9d4e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (sci->sc_task) {
wake_up(&sci->sc_wait_daemon);
- kthread_stop(sci->sc_task);
+ if (kthread_stop(sci->sc_task)) {
+ spin_lock(&sci->sc_state_lock);
+ sci->sc_task = NULL;
+ timer_shutdown_sync(&sci->sc_timer);
+ spin_unlock(&sci->sc_state_lock);
+ }
}
spin_lock(&sci->sc_state_lock);
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 330f269abedf..83f93337c01b 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -1226,7 +1226,7 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
if (unlikely(!sufile))
return -ENOMEM;
- if (!(sufile->i_state & I_NEW))
+ if (!(inode_state_read_once(sufile) & I_NEW))
goto out;
err = nilfs_mdt_init(sufile, NILFS_MDT_GFP, sizeof(*sui));
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1dadda82cae5..d0b9b984002f 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1597,16 +1597,20 @@ static struct hlist_head *fanotify_alloc_merge_hash(void)
return hash;
}
+DEFINE_CLASS(fsnotify_group,
+ struct fsnotify_group *,
+ if (!IS_ERR_OR_NULL(_T)) fsnotify_destroy_group(_T),
+ fsnotify_alloc_group(ops, flags),
+ const struct fsnotify_ops *ops, int flags)
+
/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
struct user_namespace *user_ns = current_user_ns();
- struct fsnotify_group *group;
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
unsigned int internal_flags = 0;
- struct file *file;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
@@ -1690,36 +1694,29 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
- /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
- group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
+ CLASS(fsnotify_group, group)(&fanotify_fsnotify_ops,
FSNOTIFY_GROUP_USER);
- if (IS_ERR(group)) {
+ /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
+ if (IS_ERR(group))
return PTR_ERR(group);
- }
/* Enforce groups limits per user in all containing user ns */
group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
UCOUNT_FANOTIFY_GROUPS);
- if (!group->fanotify_data.ucounts) {
- fd = -EMFILE;
- goto out_destroy_group;
- }
+ if (!group->fanotify_data.ucounts)
+ return -EMFILE;
group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
group->user_ns = get_user_ns(user_ns);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
- if (!group->fanotify_data.merge_hash) {
- fd = -ENOMEM;
- goto out_destroy_group;
- }
+ if (!group->fanotify_data.merge_hash)
+ return -ENOMEM;
group->overflow_event = fanotify_alloc_overflow_event();
- if (unlikely(!group->overflow_event)) {
- fd = -ENOMEM;
- goto out_destroy_group;
- }
+ if (unlikely(!group->overflow_event))
+ return -ENOMEM;
if (force_o_largefile())
event_f_flags |= O_LARGEFILE;
@@ -1738,8 +1735,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
break;
default:
- fd = -EINVAL;
- goto out_destroy_group;
+ return -EINVAL;
}
BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
@@ -1750,27 +1746,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
if (flags & FAN_ENABLE_AUDIT) {
- fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
- goto out_destroy_group;
- }
-
- fd = get_unused_fd_flags(f_flags);
- if (fd < 0)
- goto out_destroy_group;
-
- file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
- f_flags, FMODE_NONOTIFY);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto out_destroy_group;
+ return -EPERM;
}
- fd_install(fd, file);
- return fd;
-out_destroy_group:
- fsnotify_destroy_group(group);
+ fd = FD_ADD(f_flags,
+ anon_inode_getfile_fmode("[fanotify]", &fanotify_fops,
+ group, f_flags, FMODE_NONOTIFY));
+ if (fd >= 0)
+ retain_and_null_ptr(group);
return fd;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 46bfc543f946..d27ff5e5f165 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -52,7 +52,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* the inode cannot have any associated watches.
*/
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 79b026a36fb6..bf27d5da91f1 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -58,6 +58,8 @@ const struct dentry_operations ns_dentry_operations = {
static void nsfs_evict(struct inode *inode)
{
struct ns_common *ns = inode->i_private;
+
+ __ns_ref_active_put(ns);
clear_inode(inode);
ns->ops->put(ns);
}
@@ -108,7 +110,6 @@ int ns_get_path(struct path *path, struct task_struct *task,
int open_namespace(struct ns_common *ns)
{
struct path path __free(path_put) = {};
- struct file *f;
int err;
/* call first to consume reference */
@@ -116,16 +117,7 @@ int open_namespace(struct ns_common *ns)
if (err < 0)
return err;
- CLASS(get_unused_fd, fd)(O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- f = dentry_open(&path, O_RDONLY, current_cred());
- if (IS_ERR(f))
- return PTR_ERR(f);
-
- fd_install(fd, f);
- return take_fd(fd);
+ return FD_ADD(O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred()));
}
int open_related_ns(struct ns_common *ns,
@@ -311,7 +303,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
struct mnt_ns_info kinfo = {};
struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
struct path path __free(path_put) = {};
- struct file *f __free(fput) = NULL;
size_t usize = _IOC_SIZE(ioctl);
if (ns->ns_type != CLONE_NEWNS)
@@ -330,28 +321,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (ret)
return ret;
- CLASS(get_unused_fd, fd)(O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- f = dentry_open(&path, O_RDONLY, current_cred());
- if (IS_ERR(f))
- return PTR_ERR(f);
-
- if (uinfo) {
- /*
- * If @uinfo is passed return all information about the
- * mount namespace as well.
- */
- ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
- if (ret)
- return ret;
- }
-
- /* Transfer reference of @f to caller's fdtable. */
- fd_install(fd, no_free_ptr(f));
- /* File descriptor is live so hand it off to the caller. */
- return take_fd(fd);
+ FD_PREPARE(fdf, O_CLOEXEC, dentry_open(&path, O_RDONLY, current_cred()));
+ if (fdf.err)
+ return fdf.err;
+ /*
+ * If @uinfo is passed return all information about the
+ * mount namespace as well.
+ */
+ ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
+ if (ret)
+ return ret;
+ ret = fd_publish(fdf);
+ break;
}
default:
ret = -ENOTTY;
@@ -408,6 +389,7 @@ static const struct super_operations nsfs_ops = {
.statfs = simple_statfs,
.evict_inode = nsfs_evict,
.show_path = nsfs_show_path,
+ .drop_inode = inode_just_drop,
};
static int nsfs_init_inode(struct inode *inode, void *data)
@@ -418,6 +400,16 @@ static int nsfs_init_inode(struct inode *inode, void *data)
inode->i_mode |= S_IRUGO;
inode->i_fop = &ns_file_operations;
inode->i_ino = ns->inum;
+
+ /*
+ * Bring the namespace subtree back to life if we have to. This
+ * can happen when e.g., all processes using a network namespace
+ * and all namespace files or namespace file bind-mounts have
+ * died but there are still sockets pinning it. The SIOCGSKNS
+ * ioctl on such a socket will resurrect the relevant namespace
+ * subtree.
+ */
+ __ns_ref_active_get(ns);
return 0;
}
@@ -458,6 +450,45 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return FILEID_NSFS;
}
+bool is_current_namespace(struct ns_common *ns)
+{
+ switch (ns->ns_type) {
+#ifdef CONFIG_CGROUPS
+ case CLONE_NEWCGROUP:
+ return current_in_namespace(to_cg_ns(ns));
+#endif
+#ifdef CONFIG_IPC_NS
+ case CLONE_NEWIPC:
+ return current_in_namespace(to_ipc_ns(ns));
+#endif
+ case CLONE_NEWNS:
+ return current_in_namespace(to_mnt_ns(ns));
+#ifdef CONFIG_NET_NS
+ case CLONE_NEWNET:
+ return current_in_namespace(to_net_ns(ns));
+#endif
+#ifdef CONFIG_PID_NS
+ case CLONE_NEWPID:
+ return current_in_namespace(to_pid_ns(ns));
+#endif
+#ifdef CONFIG_TIME_NS
+ case CLONE_NEWTIME:
+ return current_in_namespace(to_time_ns(ns));
+#endif
+#ifdef CONFIG_USER_NS
+ case CLONE_NEWUSER:
+ return current_in_namespace(to_user_ns(ns));
+#endif
+#ifdef CONFIG_UTS_NS
+ case CLONE_NEWUTS:
+ return current_in_namespace(to_uts_ns(ns));
+#endif
+ default:
+ VFS_WARN_ON_ONCE(true);
+ return false;
+ }
+}
+
static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
int fh_len, int fh_type)
{
@@ -483,18 +514,35 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return NULL;
}
+ if (!fid->ns_id)
+ return NULL;
+ /* Either both are set or both are unset. */
+ if (!fid->ns_inum != !fid->ns_type)
+ return NULL;
+
scoped_guard(rcu) {
ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type);
if (!ns)
return NULL;
VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id);
- VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type);
- if (ns->inum != fid->ns_inum)
+ if (fid->ns_inum && (fid->ns_inum != ns->inum))
+ return NULL;
+ if (fid->ns_type && (fid->ns_type != ns->ns_type))
return NULL;
- if (!__ns_ref_get(ns))
+ /*
+ * This is racy because we're not actually taking an
+ * active reference. IOW, it could happen that the
+ * namespace becomes inactive after this check.
+ * We don't care because nsfs_init_inode() will just
+ * resurrect the relevant namespace tree for us. If it
+ * has been active here we just allow its resurrection.
+ * We could try to take an active reference here and
+ * then drop it again. But really, why bother.
+ */
+ if (!ns_get_unless_inactive(ns))
return NULL;
}
@@ -590,6 +638,8 @@ static int nsfs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &nsfs_ops;
ctx->eops = &nsfs_export_operations;
ctx->dops = &ns_dentry_operations;
@@ -612,3 +662,27 @@ void __init nsfs_init(void)
nsfs_root_path.mnt = nsfs_mnt;
nsfs_root_path.dentry = nsfs_mnt->mnt_root;
}
+
+void nsproxy_ns_active_get(struct nsproxy *ns)
+{
+ ns_ref_active_get(ns->mnt_ns);
+ ns_ref_active_get(ns->uts_ns);
+ ns_ref_active_get(ns->ipc_ns);
+ ns_ref_active_get(ns->pid_ns_for_children);
+ ns_ref_active_get(ns->cgroup_ns);
+ ns_ref_active_get(ns->net_ns);
+ ns_ref_active_get(ns->time_ns);
+ ns_ref_active_get(ns->time_ns_for_children);
+}
+
+void nsproxy_ns_active_put(struct nsproxy *ns)
+{
+ ns_ref_active_put(ns->mnt_ns);
+ ns_ref_active_put(ns->uts_ns);
+ ns_ref_active_put(ns->ipc_ns);
+ ns_ref_active_put(ns->pid_ns_for_children);
+ ns_ref_active_put(ns->cgroup_ns);
+ ns_ref_active_put(ns->net_ns);
+ ns_ref_active_put(ns->time_ns);
+ ns_ref_active_put(ns->time_ns_for_children);
+}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 3959f23c487a..08266adc42ba 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -537,7 +537,7 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
return ERR_PTR(-ENOMEM);
/* If this is a freshly allocated inode, need to read it now. */
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
inode = ntfs_read_mft(inode, name, ref);
else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) {
/*
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index ddff94c091b8..8d09dfec970a 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -51,6 +51,7 @@
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/log2.h>
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 62464d194da3..af1e2cedb217 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/fs_struct.h>
#include <cluster/masklog.h>
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 162711cc5b20..b267ec580da9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6892,7 +6892,7 @@ static void ocfs2_zero_cluster_folios(struct inode *inode, loff_t start,
ocfs2_map_and_dirty_folio(inode, handle, from, to, folio, 1,
&phys);
- start = folio_next_index(folio) << PAGE_SHIFT;
+ start = folio_next_pos(folio);
}
out:
if (folios)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 92a6149da9c1..619ff03b15d6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2487,7 +2487,7 @@ update:
* which hasn't been populated yet, so clear the refresh flag
* and let the caller handle it.
*/
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
status = 0;
if (lockres)
ocfs2_complete_lock_res_refresh(lockres, 0);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index fcc89856ab95..78f81950c9ee 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -152,8 +152,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
mlog_errno(PTR_ERR(inode));
goto bail;
}
- trace_ocfs2_iget5_locked(inode->i_state);
- if (inode->i_state & I_NEW) {
+ trace_ocfs2_iget5_locked(inode_state_read_once(inode));
+ if (inode_state_read_once(inode) & I_NEW) {
rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
@@ -1290,6 +1290,8 @@ static void ocfs2_clear_inode(struct inode *inode)
void ocfs2_evict_inode(struct inode *inode)
{
+ write_inode_now(inode, 1);
+
if (!inode->i_nlink ||
(OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
ocfs2_delete_inode(inode);
@@ -1299,27 +1301,6 @@ void ocfs2_evict_inode(struct inode *inode)
ocfs2_clear_inode(inode);
}
-/* Called under inode_lock, with no more references on the
- * struct inode, so it's safe here to check the flags field
- * and to manipulate i_nlink without any other locks. */
-int ocfs2_drop_inode(struct inode *inode)
-{
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
- trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
- inode->i_nlink, oi->ip_flags);
-
- assert_spin_locked(&inode->i_lock);
- inode->i_state |= I_WILL_FREE;
- spin_unlock(&inode->i_lock);
- write_inode_now(inode, 1);
- spin_lock(&inode->i_lock);
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state &= ~I_WILL_FREE;
-
- return 1;
-}
-
/*
* This is called from our getattr.
*/
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index accf03d4765e..07bd838e7843 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -116,7 +116,6 @@ static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
}
void ocfs2_evict_inode(struct inode *inode);
-int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e5f58ff2175f..85239807dec7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -902,15 +902,8 @@ bail:
static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
- struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
- };
-
- return filemap_fdatawrite_wbc(mapping, &wbc);
+ return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
+ jinode->i_dirty_start, jinode->i_dirty_end);
}
int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 54ed1495de9a..4b32fb5658ad 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1569,8 +1569,6 @@ DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
-DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
-
TRACE_EVENT(ocfs2_inode_revalidate,
TP_PROTO(void *inode, unsigned long long ino,
unsigned int flags),
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 53daa4482406..2c7ba1480f7a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -129,7 +129,7 @@ static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
.free_inode = ocfs2_free_inode,
- .drop_inode = ocfs2_drop_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ocfs2_evict_inode,
.sync_fs = ocfs2_sync_fs,
.put_super = ocfs2_put_super,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 135c49c5d848..701ed85d9831 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -14,6 +14,7 @@
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/crc-itu-t.h>
+#include <linux/fs_struct.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include "omfs.h"
@@ -212,7 +213,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
bh = omfs_bread(inode->i_sb, ino);
diff --git a/fs/open.c b/fs/open.c
index 3d64372ecc67..f328622061c5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -191,12 +191,9 @@ int do_ftruncate(struct file *file, loff_t length, int small)
if (error)
return error;
- sb_start_write(inode->i_sb);
- error = do_truncate(file_mnt_idmap(file), dentry, length,
- ATTR_MTIME | ATTR_CTIME, file);
- sb_end_write(inode->i_sb);
-
- return error;
+ scoped_guard(super_write, inode->i_sb)
+ return do_truncate(file_mnt_idmap(file), dentry, length,
+ ATTR_MTIME | ATTR_CTIME, file);
}
int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
@@ -631,7 +628,7 @@ out:
int chmod_common(const struct path *path, umode_t mode)
{
struct inode *inode = path->dentry->d_inode;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
struct iattr newattrs;
int error;
@@ -651,7 +648,7 @@ retry_deleg:
&newattrs, &delegated_inode);
out_unlock:
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
@@ -756,7 +753,7 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
struct mnt_idmap *idmap;
struct user_namespace *fs_userns;
struct inode *inode = path->dentry->d_inode;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
int error;
struct iattr newattrs;
kuid_t uid;
@@ -791,7 +788,7 @@ retry_deleg:
error = notify_change(idmap, path->dentry, &newattrs,
&delegated_inode);
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
@@ -940,7 +937,7 @@ static int do_dentry_open(struct file *f,
}
error = security_file_open(f);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
/*
@@ -950,11 +947,11 @@ static int do_dentry_open(struct file *f,
* pseudo file, this call will not change the mode.
*/
error = fsnotify_open_perm_and_set_mode(f);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
error = break_lease(file_inode(f), f->f_flags);
- if (error)
+ if (unlikely(error))
goto cleanup_all;
/* normally all 3 are set; ->open() can clear them if needed */
@@ -1171,9 +1168,7 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
if (IS_ERR(f))
return f;
- error = vfs_create(mnt_idmap(path->mnt),
- d_inode(path->dentry->d_parent),
- path->dentry, mode, true);
+ error = vfs_create(mnt_idmap(path->mnt), path->dentry, mode, NULL);
if (!error)
error = vfs_open(path, f);
@@ -1421,8 +1416,8 @@ static int do_sys_openat2(int dfd, const char __user *filename,
struct open_how *how)
{
struct open_flags op;
- struct filename *tmp;
- int err, fd;
+ struct filename *tmp __free(putname) = NULL;
+ int err;
err = build_open_flags(how, &op);
if (unlikely(err))
@@ -1432,18 +1427,7 @@ static int do_sys_openat2(int dfd, const char __user *filename,
if (IS_ERR(tmp))
return PTR_ERR(tmp);
- fd = get_unused_fd_flags(how->flags);
- if (likely(fd >= 0)) {
- struct file *f = do_filp_open(dfd, tmp, &op);
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- fd = PTR_ERR(f);
- } else {
- fd_install(fd, f);
- }
- }
- putname(tmp);
- return fd;
+ return FD_ADD(how->flags, do_filp_open(dfd, tmp, &op));
}
int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 26ecda0e4d19..fb8d84bdedfb 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -236,7 +236,7 @@ found:
mutex_unlock(&op_mutex);
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
simple_inode_init_ts(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a01400cd41fd..d7275990ffa4 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -878,7 +878,9 @@ int orangefs_update_time(struct inode *inode, int flags)
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
get_khandle_from_ino(inode));
- flags = generic_update_time(inode, flags);
+
+ flags = inode_update_timestamps(inode, flags);
+
memset(&iattr, 0, sizeof iattr);
if (flags & S_ATIME)
iattr.ia_valid |= ATTR_ATIME;
@@ -1041,7 +1043,7 @@ struct inode *orangefs_iget(struct super_block *sb,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 0fdceb00ca07..9ab1119ebd28 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -247,7 +247,7 @@ again:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+ orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
@@ -281,13 +281,13 @@ again2:
spin_lock(&inode->i_lock);
/* Must have all the attributes in the mask and be within cache time. */
if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) ||
- orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) {
+ orangefs_inode->attr_valid || inode_state_read(inode) & I_DIRTY_PAGES) {
if (orangefs_inode->attr_valid) {
spin_unlock(&inode->i_lock);
write_inode_now(inode, 1);
goto again2;
}
- if (inode->i_state & I_DIRTY_PAGES) {
+ if (inode_state_read(inode) & I_DIRTY_PAGES) {
ret = 0;
goto out_unlock;
}
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 604a82acd164..758611ee4475 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -523,8 +523,8 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
- struct dentry *index = NULL;
struct dentry *temp = NULL;
+ struct renamedata rd = {};
struct qstr name = { };
int err;
@@ -556,17 +556,15 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
if (err)
goto out;
- err = ovl_parent_lock(indexdir, temp);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = indexdir;
+ rd.new_parent = indexdir;
+ err = start_renaming_dentry(&rd, 0, temp, &name);
if (err)
goto out;
- index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
- if (IS_ERR(index)) {
- err = PTR_ERR(index);
- } else {
- err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0);
- dput(index);
- }
- ovl_parent_unlock(indexdir);
+
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
out:
if (err)
ovl_cleanup(ofs, indexdir, temp);
@@ -613,9 +611,9 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
if (err)
goto out;
- inode_lock_nested(udir, I_MUTEX_PARENT);
- upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
- c->dentry->d_name.len);
+ upper = ovl_start_creating_upper(ofs, upperdir,
+ &QSTR_LEN(c->dentry->d_name.name,
+ c->dentry->d_name.len));
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
@@ -626,9 +624,8 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
ovl_dentry_set_upper_alias(c->dentry);
ovl_dentry_update_reval(c->dentry, upper);
}
- dput(upper);
+ end_creating(upper);
}
- inode_unlock(udir);
if (err)
goto out;
@@ -727,34 +724,33 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
return err;
}
-struct ovl_cu_creds {
- const struct cred *old;
- struct cred *new;
-};
-
-static int ovl_prep_cu_creds(struct dentry *dentry, struct ovl_cu_creds *cc)
+static const struct cred *ovl_prepare_copy_up_creds(struct dentry *dentry)
{
+ struct cred *copy_up_cred = NULL;
int err;
- cc->old = cc->new = NULL;
- err = security_inode_copy_up(dentry, &cc->new);
+ err = security_inode_copy_up(dentry, &copy_up_cred);
if (err < 0)
- return err;
+ return ERR_PTR(err);
- if (cc->new)
- cc->old = override_creds(cc->new);
+ if (!copy_up_cred)
+ return NULL;
- return 0;
+ return override_creds(copy_up_cred);
}
-static void ovl_revert_cu_creds(struct ovl_cu_creds *cc)
+static void ovl_revert_copy_up_creds(const struct cred *orig_cred)
{
- if (cc->new) {
- revert_creds(cc->old);
- put_cred(cc->new);
- }
+ const struct cred *copy_up_cred;
+
+ copy_up_cred = revert_creds(orig_cred);
+ put_cred(copy_up_cred);
}
+DEFINE_CLASS(copy_up_creds, const struct cred *,
+ if (!IS_ERR_OR_NULL(_T)) ovl_revert_copy_up_creds(_T),
+ ovl_prepare_copy_up_creds(dentry), struct dentry *dentry)
+
/*
* Copyup using workdir to prepare temp file. Used when copying up directories,
* special files or when upper fs doesn't support O_TMPFILE.
@@ -764,8 +760,8 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
struct path path = { .mnt = ovl_upper_mnt(ofs) };
- struct dentry *temp, *upper, *trap;
- struct ovl_cu_creds cc;
+ struct renamedata rd = {};
+ struct dentry *temp;
int err;
struct ovl_cattr cattr = {
/* Can't properly set mode on creation because of the umask */
@@ -774,14 +770,14 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
.link = c->link
};
- err = ovl_prep_cu_creds(c->dentry, &cc);
- if (err)
- return err;
+ scoped_class(copy_up_creds, copy_up_creds, c->dentry) {
+ if (IS_ERR(copy_up_creds))
+ return PTR_ERR(copy_up_creds);
- ovl_start_write(c->dentry);
- temp = ovl_create_temp(ofs, c->workdir, &cattr);
- ovl_end_write(c->dentry);
- ovl_revert_cu_creds(&cc);
+ ovl_start_write(c->dentry);
+ temp = ovl_create_temp(ofs, c->workdir, &cattr);
+ ovl_end_write(c->dentry);
+ }
if (IS_ERR(temp))
return PTR_ERR(temp);
@@ -808,29 +804,24 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
* ovl_copy_up_data(), so lock workdir and destdir and make sure that
* temp wasn't moved before copy up completion or cleanup.
*/
- trap = lock_rename(c->workdir, c->destdir);
- if (trap || temp->d_parent != c->workdir) {
- /* temp or workdir moved underneath us? abort without cleanup */
- dput(temp);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = c->workdir;
+ rd.new_parent = c->destdir;
+ rd.flags = 0;
+ err = start_renaming_dentry(&rd, 0, temp,
+ &QSTR_LEN(c->destname.name, c->destname.len));
+ if (err) {
+ /* temp or workdir moved underneath us? map to -EIO */
err = -EIO;
- if (!IS_ERR(trap))
- unlock_rename(c->workdir, c->destdir);
- goto out;
}
-
- err = ovl_copy_up_metadata(c, temp);
if (err)
- goto cleanup;
+ goto cleanup_unlocked;
- upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
- c->destname.len);
- err = PTR_ERR(upper);
- if (IS_ERR(upper))
- goto cleanup;
+ err = ovl_copy_up_metadata(c, temp);
+ if (!err)
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
- err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
- unlock_rename(c->workdir, c->destdir);
- dput(upper);
if (err)
goto cleanup_unlocked;
@@ -851,8 +842,6 @@ out:
return err;
-cleanup:
- unlock_rename(c->workdir, c->destdir);
cleanup_unlocked:
ovl_cleanup(ofs, c->workdir, temp);
dput(temp);
@@ -866,17 +855,17 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
struct inode *udir = d_inode(c->destdir);
struct dentry *temp, *upper;
struct file *tmpfile;
- struct ovl_cu_creds cc;
int err;
- err = ovl_prep_cu_creds(c->dentry, &cc);
- if (err)
- return err;
+ scoped_class(copy_up_creds, copy_up_creds, c->dentry) {
+ if (IS_ERR(copy_up_creds))
+ return PTR_ERR(copy_up_creds);
+
+ ovl_start_write(c->dentry);
+ tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
+ ovl_end_write(c->dentry);
+ }
- ovl_start_write(c->dentry);
- tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
- ovl_end_write(c->dentry);
- ovl_revert_cu_creds(&cc);
if (IS_ERR(tmpfile))
return PTR_ERR(tmpfile);
@@ -894,16 +883,14 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
goto out;
- inode_lock_nested(udir, I_MUTEX_PARENT);
-
- upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
- c->destname.len);
+ upper = ovl_start_creating_upper(ofs, c->destdir,
+ &QSTR_LEN(c->destname.name,
+ c->destname.len));
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
err = ovl_do_link(ofs, temp, udir, upper);
- dput(upper);
+ end_creating(upper);
}
- inode_unlock(udir);
if (err)
goto out;
@@ -1214,7 +1201,6 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
static int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
- const struct cred *old_cred;
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
/*
@@ -1234,7 +1220,6 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
if (err)
return err;
- old_cred = ovl_override_creds(dentry->d_sb);
while (!err) {
struct dentry *next;
struct dentry *parent = NULL;
@@ -1254,12 +1239,12 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
next = parent;
}
- err = ovl_copy_up_one(parent, next, flags);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_copy_up_one(parent, next, flags);
dput(parent);
dput(next);
}
- ovl_revert_creds(old_cred);
return err;
}
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index a5e9ddf3023b..06b860b9ded6 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -47,79 +47,70 @@ static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir,
int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
struct dentry *wdentry)
{
- int err;
-
- err = ovl_parent_lock(workdir, wdentry);
- if (err)
- return err;
+ wdentry = start_removing_dentry(workdir, wdentry);
+ if (IS_ERR(wdentry))
+ return PTR_ERR(wdentry);
ovl_cleanup_locked(ofs, workdir->d_inode, wdentry);
- ovl_parent_unlock(workdir);
+ end_removing(wdentry);
return 0;
}
-struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
+void ovl_tempname(char name[OVL_TEMPNAME_SIZE])
{
- struct dentry *temp;
- char name[20];
static atomic_t temp_id = ATOMIC_INIT(0);
/* counter is allowed to wrap, since temp dentries are ephemeral */
- snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id));
+ snprintf(name, OVL_TEMPNAME_SIZE, "#%x", atomic_inc_return(&temp_id));
+}
- temp = ovl_lookup_upper(ofs, name, workdir, strlen(name));
- if (!IS_ERR(temp) && temp->d_inode) {
- pr_err("workdir/%s already exists\n", name);
- dput(temp);
- temp = ERR_PTR(-EIO);
- }
+static struct dentry *ovl_start_creating_temp(struct ovl_fs *ofs,
+ struct dentry *workdir)
+{
+ char name[OVL_TEMPNAME_SIZE];
- return temp;
+ ovl_tempname(name);
+ return start_creating(ovl_upper_mnt_idmap(ofs), workdir,
+ &QSTR(name));
}
static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
{
int err;
- struct dentry *whiteout;
+ struct dentry *whiteout, *link;
struct dentry *workdir = ofs->workdir;
struct inode *wdir = workdir->d_inode;
guard(mutex)(&ofs->whiteout_lock);
if (!ofs->whiteout) {
- inode_lock_nested(wdir, I_MUTEX_PARENT);
- whiteout = ovl_lookup_temp(ofs, workdir);
- if (!IS_ERR(whiteout)) {
- err = ovl_do_whiteout(ofs, wdir, whiteout);
- if (err) {
- dput(whiteout);
- whiteout = ERR_PTR(err);
- }
- }
- inode_unlock(wdir);
+ whiteout = ovl_start_creating_temp(ofs, workdir);
if (IS_ERR(whiteout))
return whiteout;
- ofs->whiteout = whiteout;
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
+ if (!err)
+ ofs->whiteout = dget(whiteout);
+ end_creating(whiteout);
+ if (err)
+ return ERR_PTR(err);
}
if (!ofs->no_shared_whiteout) {
- inode_lock_nested(wdir, I_MUTEX_PARENT);
- whiteout = ovl_lookup_temp(ofs, workdir);
- if (!IS_ERR(whiteout)) {
- err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
- if (err) {
- dput(whiteout);
- whiteout = ERR_PTR(err);
- }
- }
- inode_unlock(wdir);
- if (!IS_ERR(whiteout))
+ link = ovl_start_creating_temp(ofs, workdir);
+ if (IS_ERR(link))
+ return link;
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, link);
+ if (!err)
+ whiteout = dget(link);
+ end_creating(link);
+ if (!err)
return whiteout;
- if (PTR_ERR(whiteout) != -EMLINK) {
- pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n",
+
+ if (err != -EMLINK) {
+ pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%u)\n",
ofs->whiteout->d_inode->i_nlink,
- PTR_ERR(whiteout));
+ err);
ofs->no_shared_whiteout = true;
}
}
@@ -132,6 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
struct dentry *dentry)
{
struct dentry *whiteout;
+ struct renamedata rd = {};
int err;
int flags = 0;
@@ -143,10 +135,14 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
if (d_is_dir(dentry))
flags = RENAME_EXCHANGE;
- err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = ofs->workdir;
+ rd.new_parent = dir;
+ rd.flags = flags;
+ err = start_renaming_two_dentries(&rd, whiteout, dentry);
if (!err) {
- err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags);
- unlock_rename(ofs->workdir, dir);
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
}
if (err)
goto kill_whiteout;
@@ -191,7 +187,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) {
pr_warn_ratelimited("wrong inherited casefold (%pd2)\n",
newdentry);
- dput(newdentry);
+ end_creating(newdentry);
err = -EINVAL;
}
break;
@@ -241,8 +237,7 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
}
out:
if (err) {
- if (!IS_ERR(newdentry))
- dput(newdentry);
+ end_creating(newdentry);
return ERR_PTR(err);
}
return newdentry;
@@ -252,11 +247,11 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr)
{
struct dentry *ret;
- inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT);
- ret = ovl_create_real(ofs, workdir,
- ovl_lookup_temp(ofs, workdir), attr);
- inode_unlock(workdir->d_inode);
- return ret;
+ ret = ovl_start_creating_temp(ofs, workdir);
+ if (IS_ERR(ret))
+ return ret;
+ ret = ovl_create_real(ofs, workdir, ret, attr);
+ return end_creating_keep(ret);
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -354,18 +349,19 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
- struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
- inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = ovl_create_real(ofs, upperdir,
- ovl_lookup_upper(ofs, dentry->d_name.name,
- upperdir, dentry->d_name.len),
- attr);
- inode_unlock(udir);
+ newdentry = ovl_start_creating_upper(ofs, upperdir,
+ &QSTR_LEN(dentry->d_name.name,
+ dentry->d_name.len));
if (IS_ERR(newdentry))
return PTR_ERR(newdentry);
+ newdentry = ovl_create_real(ofs, upperdir, newdentry, attr);
+ if (IS_ERR(newdentry))
+ return PTR_ERR(newdentry);
+
+ end_creating_keep(newdentry);
if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
!ovl_allow_offline_changes(ofs)) {
@@ -391,6 +387,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct renamedata rd = {};
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
@@ -416,7 +413,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (IS_ERR(opaquedir))
goto out;
- err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = workdir;
+ rd.new_parent = upperdir;
+ rd.flags = RENAME_EXCHANGE;
+ err = start_renaming_two_dentries(&rd, opaquedir, upper);
if (err)
goto out_cleanup_unlocked;
@@ -434,8 +435,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE);
- unlock_rename(workdir, upperdir);
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
if (err)
goto out_cleanup_unlocked;
@@ -448,7 +449,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
return opaquedir;
out_cleanup:
- unlock_rename(workdir, upperdir);
+ end_renaming(&rd);
out_cleanup_unlocked:
ovl_cleanup(ofs, workdir, opaquedir);
dput(opaquedir);
@@ -471,6 +472,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
+ struct renamedata rd = {};
struct dentry *upper;
struct dentry *newdentry;
int err;
@@ -502,7 +504,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (IS_ERR(newdentry))
goto out_dput;
- err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = workdir;
+ rd.new_parent = upperdir;
+ rd.flags = 0;
+ err = start_renaming_two_dentries(&rd, newdentry, upper);
if (err)
goto out_cleanup_unlocked;
@@ -539,16 +545,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
- RENAME_EXCHANGE);
- unlock_rename(workdir, upperdir);
+ rd.flags = RENAME_EXCHANGE;
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
if (err)
goto out_cleanup_unlocked;
ovl_cleanup(ofs, workdir, upper);
} else {
- err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0);
- unlock_rename(workdir, upperdir);
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
if (err)
goto out_cleanup_unlocked;
}
@@ -568,66 +574,76 @@ out:
return err;
out_cleanup:
- unlock_rename(workdir, upperdir);
+ end_renaming(&rd);
out_cleanup_unlocked:
ovl_cleanup(ofs, workdir, newdentry);
dput(newdentry);
goto out_dput;
}
-static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry,
- struct inode *inode,
- umode_t mode,
- const struct cred *old_cred)
+static const struct cred *ovl_override_creator_creds(struct dentry *dentry, struct inode *inode, umode_t mode)
{
int err;
- struct cred *override_cred;
- override_cred = prepare_creds();
+ if (WARN_ON_ONCE(current->cred != ovl_creds(dentry->d_sb)))
+ return ERR_PTR(-EINVAL);
+
+ CLASS(prepare_creds, override_cred)();
if (!override_cred)
return ERR_PTR(-ENOMEM);
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
+
err = security_dentry_create_files_as(dentry, mode, &dentry->d_name,
- old_cred, override_cred);
- if (err) {
- put_cred(override_cred);
+ current->cred, override_cred);
+ if (err)
return ERR_PTR(err);
- }
- /*
- * Caller is going to match this with revert_creds() and drop
- * referenec on the returned creds.
- * We must be called with creator creds already, otherwise we risk
- * leaking creds.
- */
- old_cred = override_creds(override_cred);
- WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb));
+ return override_creds(no_free_ptr(override_cred));
+}
+
+static void ovl_revert_creator_creds(const struct cred *old_cred)
+{
+ const struct cred *override_cred;
- return override_cred;
+ override_cred = revert_creds(old_cred);
+ put_cred(override_cred);
+}
+
+DEFINE_CLASS(ovl_override_creator_creds,
+ const struct cred *,
+ if (!IS_ERR_OR_NULL(_T)) ovl_revert_creator_creds(_T),
+ ovl_override_creator_creds(dentry, inode, mode),
+ struct dentry *dentry, struct inode *inode, umode_t mode)
+
+static int ovl_create_handle_whiteouts(struct dentry *dentry,
+ struct inode *inode,
+ struct ovl_cattr *attr)
+{
+ if (!ovl_dentry_is_whiteout(dentry))
+ return ovl_create_upper(dentry, inode, attr);
+
+ return ovl_create_over_whiteout(dentry, inode, attr);
}
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
struct ovl_cattr *attr, bool origin)
{
int err;
- const struct cred *old_cred, *new_cred = NULL;
struct dentry *parent = dentry->d_parent;
- old_cred = ovl_override_creds(dentry->d_sb);
-
- /*
- * When linking a file with copy up origin into a new parent, mark the
- * new parent dir "impure".
- */
- if (origin) {
- err = ovl_set_impure(parent, ovl_dentry_upper(parent));
- if (err)
- goto out_revert_creds;
- }
+ with_ovl_creds(dentry->d_sb) {
+ /*
+ * When linking a file with copy up origin into a new parent, mark the
+ * new parent dir "impure".
+ */
+ if (origin) {
+ err = ovl_set_impure(parent, ovl_dentry_upper(parent));
+ if (err)
+ return err;
+ }
- if (!attr->hardlink) {
/*
* In the creation cases(create, mkdir, mknod, symlink),
* ovl should transfer current's fs{u,g}id to underlying
@@ -641,23 +657,16 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
* create a new inode, so just use the ovl mounter's
* fs{u,g}id.
*/
- new_cred = ovl_setup_cred_for_create(dentry, inode, attr->mode,
- old_cred);
- err = PTR_ERR(new_cred);
- if (IS_ERR(new_cred)) {
- new_cred = NULL;
- goto out_revert_creds;
- }
- }
- if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr);
- else
- err = ovl_create_over_whiteout(dentry, inode, attr);
+ if (attr->hardlink)
+ return ovl_create_handle_whiteouts(dentry, inode, attr);
-out_revert_creds:
- ovl_revert_creds(old_cred);
- put_cred(new_cred);
+ scoped_class(ovl_override_creator_creds, cred, dentry, inode, attr->mode) {
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
+ return ovl_create_handle_whiteouts(dentry, inode, attr);
+ }
+ }
return err;
}
@@ -686,7 +695,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
goto out_drop_write;
spin_lock(&inode->i_lock);
- inode->i_state |= I_CREATING;
+ inode_state_set(inode, I_CREATING);
spin_unlock(&inode->i_lock);
inode_init_owner(&nop_mnt_idmap, inode, dentry->d_parent->d_inode, mode);
@@ -733,14 +742,8 @@ static int ovl_symlink(struct mnt_idmap *idmap, struct inode *dir,
static int ovl_set_link_redirect(struct dentry *dentry)
{
- const struct cred *old_cred;
- int err;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_set_redirect(dentry, false);
- ovl_revert_creds(old_cred);
-
- return err;
+ with_ovl_creds(dentry->d_sb)
+ return ovl_set_redirect(dentry, false);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
@@ -850,17 +853,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
goto out;
}
- inode_lock_nested(dir, I_MUTEX_PARENT);
- upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_start_removing_upper(ofs, upperdir,
+ &QSTR_LEN(dentry->d_name.name,
+ dentry->d_name.len));
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_unlock;
+ goto out_dput;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && !ovl_matches_upper(dentry, upper)))
- goto out_dput_upper;
+ goto out_unlock;
if (is_dir)
err = ovl_do_rmdir(ofs, dir, upper);
@@ -876,10 +879,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir,
*/
if (!err)
d_drop(dentry);
-out_dput_upper:
- dput(upper);
out_unlock:
- inode_unlock(dir);
+ end_removing(upper);
+out_dput:
dput(opaquedir);
out:
return err;
@@ -916,7 +918,6 @@ static void ovl_drop_nlink(struct dentry *dentry)
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
int err;
- const struct cred *old_cred;
bool lower_positive = ovl_lower_positive(dentry);
LIST_HEAD(list);
@@ -935,12 +936,12 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (!lower_positive)
- err = ovl_remove_upper(dentry, is_dir, &list);
- else
- err = ovl_remove_and_whiteout(dentry, &list);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb) {
+ if (!lower_positive)
+ err = ovl_remove_upper(dentry, is_dir, &list);
+ else
+ err = ovl_remove_and_whiteout(dentry, &list);
+ }
if (!err) {
if (is_dir)
clear_nlink(dentry->d_inode);
@@ -1104,102 +1105,107 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir)
return err;
}
-static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
- struct dentry *old, struct inode *newdir,
- struct dentry *new, unsigned int flags)
+struct ovl_renamedata {
+ struct renamedata;
+ struct dentry *opaquedir;
+ bool cleanup_whiteout;
+ bool update_nlink;
+ bool overwrite;
+};
+
+static int ovl_rename_start(struct ovl_renamedata *ovlrd, struct list_head *list)
{
- int err;
- struct dentry *old_upperdir;
- struct dentry *new_upperdir;
- struct dentry *olddentry = NULL;
- struct dentry *newdentry = NULL;
- struct dentry *trap, *de;
- bool old_opaque;
- bool new_opaque;
- bool cleanup_whiteout = false;
- bool update_nlink = false;
- bool overwrite = !(flags & RENAME_EXCHANGE);
+ struct dentry *old = ovlrd->old_dentry;
+ struct dentry *new = ovlrd->new_dentry;
bool is_dir = d_is_dir(old);
bool new_is_dir = d_is_dir(new);
- bool samedir = olddir == newdir;
- struct dentry *opaquedir = NULL;
- const struct cred *old_cred = NULL;
- struct ovl_fs *ofs = OVL_FS(old->d_sb);
- LIST_HEAD(list);
+ int err;
- err = -EINVAL;
- if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
- goto out;
+ if (ovlrd->flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
+ return -EINVAL;
- flags &= ~RENAME_NOREPLACE;
+ ovlrd->flags &= ~RENAME_NOREPLACE;
/* Don't copy up directory trees */
err = -EXDEV;
if (!ovl_can_move(old))
- goto out;
- if (!overwrite && !ovl_can_move(new))
- goto out;
+ return err;
+ if (!ovlrd->overwrite && !ovl_can_move(new))
+ return err;
- if (overwrite && new_is_dir && !ovl_pure_upper(new)) {
- err = ovl_check_empty_dir(new, &list);
+ if (ovlrd->overwrite && new_is_dir && !ovl_pure_upper(new)) {
+ err = ovl_check_empty_dir(new, list);
if (err)
- goto out;
+ return err;
}
- if (overwrite) {
+ if (ovlrd->overwrite) {
if (ovl_lower_positive(old)) {
if (!ovl_dentry_is_whiteout(new)) {
/* Whiteout source */
- flags |= RENAME_WHITEOUT;
+ ovlrd->flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
- flags |= RENAME_EXCHANGE;
+ ovlrd->flags |= RENAME_EXCHANGE;
}
} else if (is_dir && ovl_dentry_is_whiteout(new)) {
- flags |= RENAME_EXCHANGE;
- cleanup_whiteout = true;
+ ovlrd->flags |= RENAME_EXCHANGE;
+ ovlrd->cleanup_whiteout = true;
}
}
err = ovl_copy_up(old);
if (err)
- goto out;
+ return err;
err = ovl_copy_up(new->d_parent);
if (err)
- goto out;
- if (!overwrite) {
+ return err;
+
+ if (!ovlrd->overwrite) {
err = ovl_copy_up(new);
if (err)
- goto out;
+ return err;
} else if (d_inode(new)) {
err = ovl_nlink_start(new);
if (err)
- goto out;
+ return err;
- update_nlink = true;
+ ovlrd->update_nlink = true;
}
- if (!update_nlink) {
+ if (!ovlrd->update_nlink) {
/* ovl_nlink_start() took ovl_want_write() */
err = ovl_want_write(old);
if (err)
- goto out;
+ return err;
}
- old_cred = ovl_override_creds(old->d_sb);
+ return 0;
+}
- if (!list_empty(&list)) {
- opaquedir = ovl_clear_empty(new, &list);
- err = PTR_ERR(opaquedir);
- if (IS_ERR(opaquedir)) {
- opaquedir = NULL;
- goto out_revert_creds;
- }
- }
+static int ovl_rename_upper(struct ovl_renamedata *ovlrd, struct list_head *list)
+{
+ struct dentry *old = ovlrd->old_dentry;
+ struct dentry *new = ovlrd->new_dentry;
+ struct ovl_fs *ofs = OVL_FS(old->d_sb);
+ struct dentry *old_upperdir = ovl_dentry_upper(old->d_parent);
+ struct dentry *new_upperdir = ovl_dentry_upper(new->d_parent);
+ bool is_dir = d_is_dir(old);
+ bool new_is_dir = d_is_dir(new);
+ bool samedir = old->d_parent == new->d_parent;
+ struct renamedata rd = {};
+ struct dentry *de;
+ struct dentry *whiteout = NULL;
+ bool old_opaque, new_opaque;
+ int err;
- old_upperdir = ovl_dentry_upper(old->d_parent);
- new_upperdir = ovl_dentry_upper(new->d_parent);
+ if (!list_empty(list)) {
+ de = ovl_clear_empty(new, list);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
+ ovlrd->opaquedir = de;
+ }
if (!samedir) {
/*
@@ -1211,95 +1217,88 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (ovl_type_origin(old)) {
err = ovl_set_impure(new->d_parent, new_upperdir);
if (err)
- goto out_revert_creds;
+ return err;
}
- if (!overwrite && ovl_type_origin(new)) {
+ if (!ovlrd->overwrite && ovl_type_origin(new)) {
err = ovl_set_impure(old->d_parent, old_upperdir);
if (err)
- goto out_revert_creds;
+ return err;
}
}
- trap = lock_rename(new_upperdir, old_upperdir);
- if (IS_ERR(trap)) {
- err = PTR_ERR(trap);
- goto out_revert_creds;
- }
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = old_upperdir;
+ rd.new_parent = new_upperdir;
+ rd.flags = ovlrd->flags;
- de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
- old->d_name.len);
- err = PTR_ERR(de);
- if (IS_ERR(de))
- goto out_unlock;
- olddentry = de;
+ err = start_renaming(&rd, 0,
+ &QSTR_LEN(old->d_name.name, old->d_name.len),
+ &QSTR_LEN(new->d_name.name, new->d_name.len));
+ if (err)
+ return err;
err = -ESTALE;
- if (!ovl_matches_upper(old, olddentry))
+ if (!ovl_matches_upper(old, rd.old_dentry))
goto out_unlock;
- de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
- new->d_name.len);
- err = PTR_ERR(de);
- if (IS_ERR(de))
- goto out_unlock;
- newdentry = de;
-
old_opaque = ovl_dentry_is_opaque(old);
new_opaque = ovl_dentry_is_opaque(new);
err = -ESTALE;
if (d_inode(new) && ovl_dentry_upper(new)) {
- if (opaquedir) {
- if (newdentry != opaquedir)
+ if (ovlrd->opaquedir) {
+ if (rd.new_dentry != ovlrd->opaquedir)
goto out_unlock;
} else {
- if (!ovl_matches_upper(new, newdentry))
+ if (!ovl_matches_upper(new, rd.new_dentry))
goto out_unlock;
}
} else {
- if (!d_is_negative(newdentry)) {
- if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
+ if (!d_is_negative(rd.new_dentry)) {
+ if (!new_opaque || !ovl_upper_is_whiteout(ofs, rd.new_dentry))
goto out_unlock;
} else {
- if (flags & RENAME_EXCHANGE)
+ if (ovlrd->flags & RENAME_EXCHANGE)
goto out_unlock;
}
}
- if (olddentry == trap)
- goto out_unlock;
- if (newdentry == trap)
- goto out_unlock;
-
- if (olddentry->d_inode == newdentry->d_inode)
+ if (rd.old_dentry->d_inode == rd.new_dentry->d_inode)
goto out_unlock;
err = 0;
if (ovl_type_merge_or_lower(old))
err = ovl_set_redirect(old, samedir);
else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent))
- err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
+ err = ovl_set_opaque_xerr(old, rd.old_dentry, -EXDEV);
if (err)
goto out_unlock;
- if (!overwrite && ovl_type_merge_or_lower(new))
+ if (!ovlrd->overwrite && ovl_type_merge_or_lower(new))
err = ovl_set_redirect(new, samedir);
- else if (!overwrite && new_is_dir && !new_opaque &&
+ else if (!ovlrd->overwrite && new_is_dir && !new_opaque &&
ovl_type_merge(old->d_parent))
- err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
+ err = ovl_set_opaque_xerr(new, rd.new_dentry, -EXDEV);
if (err)
goto out_unlock;
- err = ovl_do_rename(ofs, old_upperdir, olddentry,
- new_upperdir, newdentry, flags);
- unlock_rename(new_upperdir, old_upperdir);
+ err = ovl_do_rename_rd(&rd);
+
+ if (!err && ovlrd->cleanup_whiteout)
+ whiteout = dget(rd.new_dentry);
+
+out_unlock:
+ end_renaming(&rd);
+
if (err)
- goto out_revert_creds;
+ return err;
- if (cleanup_whiteout)
- ovl_cleanup(ofs, old_upperdir, newdentry);
+ if (whiteout) {
+ ovl_cleanup(ofs, old_upperdir, whiteout);
+ dput(whiteout);
+ }
- if (overwrite && d_inode(new)) {
+ if (ovlrd->overwrite && d_inode(new)) {
if (new_is_dir)
clear_nlink(d_inode(new));
else
@@ -1307,7 +1306,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
ovl_dir_modified(old->d_parent, ovl_type_origin(old) ||
- (!overwrite && ovl_type_origin(new)));
+ (!ovlrd->overwrite && ovl_type_origin(new)));
ovl_dir_modified(new->d_parent, ovl_type_origin(old) ||
(d_inode(new) && ovl_type_origin(new)));
@@ -1316,28 +1315,47 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (d_inode(new) && ovl_dentry_upper(new))
ovl_copyattr(d_inode(new));
-out_revert_creds:
- ovl_revert_creds(old_cred);
- if (update_nlink)
- ovl_nlink_end(new);
+ return err;
+}
+
+static void ovl_rename_end(struct ovl_renamedata *ovlrd)
+{
+ if (ovlrd->update_nlink)
+ ovl_nlink_end(ovlrd->new_dentry);
else
- ovl_drop_write(old);
-out:
- dput(newdentry);
- dput(olddentry);
- dput(opaquedir);
+ ovl_drop_write(ovlrd->old_dentry);
+}
+
+static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
+ struct dentry *old, struct inode *newdir,
+ struct dentry *new, unsigned int flags)
+{
+ struct ovl_renamedata ovlrd = {
+ .old_parent = old->d_parent,
+ .old_dentry = old,
+ .new_parent = new->d_parent,
+ .new_dentry = new,
+ .flags = flags,
+ .overwrite = !(flags & RENAME_EXCHANGE),
+ };
+ LIST_HEAD(list);
+ int err;
+
+ err = ovl_rename_start(&ovlrd, &list);
+ if (!err) {
+ with_ovl_creds(old->d_sb)
+ err = ovl_rename_upper(&ovlrd, &list);
+ ovl_rename_end(&ovlrd);
+ }
+
+ dput(ovlrd.opaquedir);
ovl_cache_free(&list);
return err;
-
-out_unlock:
- unlock_rename(new_upperdir, old_upperdir);
- goto out_revert_creds;
}
static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
struct inode *inode, umode_t mode)
{
- const struct cred *old_cred, *new_cred = NULL;
struct path realparentpath;
struct file *realfile;
struct ovl_file *of;
@@ -1346,41 +1364,36 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
int flags = file->f_flags | OVL_OPEN_FLAGS;
int err;
- old_cred = ovl_override_creds(dentry->d_sb);
- new_cred = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
- err = PTR_ERR(new_cred);
- if (IS_ERR(new_cred)) {
- new_cred = NULL;
- goto out_revert_creds;
- }
+ with_ovl_creds(dentry->d_sb) {
+ scoped_class(ovl_override_creator_creds, cred, dentry, inode, mode) {
+ if (IS_ERR(cred))
+ return PTR_ERR(cred);
- ovl_path_upper(dentry->d_parent, &realparentpath);
- realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
- mode, current_cred());
- err = PTR_ERR_OR_ZERO(realfile);
- pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err);
- if (err)
- goto out_revert_creds;
+ ovl_path_upper(dentry->d_parent, &realparentpath);
+ realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
+ mode, current_cred());
+ err = PTR_ERR_OR_ZERO(realfile);
+ pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err);
+ if (err)
+ return err;
- of = ovl_file_alloc(realfile);
- if (!of) {
- fput(realfile);
- err = -ENOMEM;
- goto out_revert_creds;
- }
+ of = ovl_file_alloc(realfile);
+ if (!of) {
+ fput(realfile);
+ return -ENOMEM;
+ }
- /* ovl_instantiate() consumes the newdentry reference on success */
- newdentry = dget(realfile->f_path.dentry);
- err = ovl_instantiate(dentry, inode, newdentry, false, file);
- if (!err) {
- file->private_data = of;
- } else {
- dput(newdentry);
- ovl_file_free(of);
+ /* ovl_instantiate() consumes the newdentry reference on success */
+ newdentry = dget(realfile->f_path.dentry);
+ err = ovl_instantiate(dentry, inode, newdentry, false, file);
+ if (!err) {
+ file->private_data = of;
+ } else {
+ dput(newdentry);
+ ovl_file_free(of);
+ }
+ }
}
-out_revert_creds:
- ovl_revert_creds(old_cred);
- put_cred(new_cred);
return err;
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 7ab2c9daffd0..cbae89457234 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -31,7 +31,6 @@ static struct file *ovl_open_realfile(const struct file *file,
struct inode *inode = file_inode(file);
struct mnt_idmap *real_idmap;
struct file *realfile;
- const struct cred *old_cred;
int flags = file->f_flags | OVL_OPEN_FLAGS;
int acc_mode = ACC_MODE(flags);
int err;
@@ -39,19 +38,19 @@ static struct file *ovl_open_realfile(const struct file *file,
if (flags & O_APPEND)
acc_mode |= MAY_APPEND;
- old_cred = ovl_override_creds(inode->i_sb);
- real_idmap = mnt_idmap(realpath->mnt);
- err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode);
- if (err) {
- realfile = ERR_PTR(err);
- } else {
- if (!inode_owner_or_capable(real_idmap, realinode))
- flags &= ~O_NOATIME;
-
- realfile = backing_file_open(file_user_path(file),
- flags, realpath, current_cred());
+ with_ovl_creds(inode->i_sb) {
+ real_idmap = mnt_idmap(realpath->mnt);
+ err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode);
+ if (err) {
+ realfile = ERR_PTR(err);
+ } else {
+ if (!inode_owner_or_capable(real_idmap, realinode))
+ flags &= ~O_NOATIME;
+
+ realfile = backing_file_open(file_user_path(file),
+ flags, realpath, current_cred());
+ }
}
- ovl_revert_creds(old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n",
file, file, ovl_whatisit(inode, realinode), file->f_flags,
@@ -244,7 +243,6 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file_inode(file);
struct file *realfile;
- const struct cred *old_cred;
loff_t ret;
/*
@@ -273,9 +271,8 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
ovl_inode_lock(inode);
realfile->f_pos = file->f_pos;
- old_cred = ovl_override_creds(inode->i_sb);
- ret = vfs_llseek(realfile, offset, whence);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(inode->i_sb)
+ ret = vfs_llseek(realfile, offset, whence);
file->f_pos = realfile->f_pos;
ovl_inode_unlock(inode);
@@ -447,7 +444,6 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
enum ovl_path_type type;
struct path upperpath;
struct file *upperfile;
- const struct cred *old_cred;
int ret;
ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
@@ -464,11 +460,8 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (IS_ERR(upperfile))
return PTR_ERR(upperfile);
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fsync_range(upperfile, start, end, datasync);
- ovl_revert_creds(old_cred);
-
- return ret;
+ with_ovl_creds(file_inode(file)->i_sb)
+ return vfs_fsync_range(upperfile, start, end, datasync);
}
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
@@ -486,7 +479,6 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
{
struct inode *inode = file_inode(file);
struct file *realfile;
- const struct cred *old_cred;
int ret;
inode_lock(inode);
@@ -501,9 +493,8 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
if (IS_ERR(realfile))
goto out_unlock;
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fallocate(realfile, mode, offset, len);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(inode->i_sb)
+ ret = vfs_fallocate(realfile, mode, offset, len);
/* Update size */
ovl_file_modified(file);
@@ -517,18 +508,13 @@ out_unlock:
static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
struct file *realfile;
- const struct cred *old_cred;
- int ret;
realfile = ovl_real_file(file);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = vfs_fadvise(realfile, offset, len, advice);
- ovl_revert_creds(old_cred);
-
- return ret;
+ with_ovl_creds(file_inode(file)->i_sb)
+ return vfs_fadvise(realfile, offset, len, advice);
}
enum ovl_copyop {
@@ -543,7 +529,6 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
{
struct inode *inode_out = file_inode(file_out);
struct file *realfile_in, *realfile_out;
- const struct cred *old_cred;
loff_t ret;
inode_lock(inode_out);
@@ -565,25 +550,25 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
if (IS_ERR(realfile_in))
goto out_unlock;
- old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
- switch (op) {
- case OVL_COPY:
- ret = vfs_copy_file_range(realfile_in, pos_in,
- realfile_out, pos_out, len, flags);
- break;
-
- case OVL_CLONE:
- ret = vfs_clone_file_range(realfile_in, pos_in,
- realfile_out, pos_out, len, flags);
- break;
-
- case OVL_DEDUPE:
- ret = vfs_dedupe_file_range_one(realfile_in, pos_in,
- realfile_out, pos_out, len,
- flags);
- break;
+ with_ovl_creds(file_inode(file_out)->i_sb) {
+ switch (op) {
+ case OVL_COPY:
+ ret = vfs_copy_file_range(realfile_in, pos_in,
+ realfile_out, pos_out, len, flags);
+ break;
+
+ case OVL_CLONE:
+ ret = vfs_clone_file_range(realfile_in, pos_in,
+ realfile_out, pos_out, len, flags);
+ break;
+
+ case OVL_DEDUPE:
+ ret = vfs_dedupe_file_range_one(realfile_in, pos_in,
+ realfile_out, pos_out, len,
+ flags);
+ break;
+ }
}
- ovl_revert_creds(old_cred);
/* Update size */
ovl_file_modified(file_out);
@@ -632,7 +617,6 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
static int ovl_flush(struct file *file, fl_owner_t id)
{
struct file *realfile;
- const struct cred *old_cred;
int err = 0;
realfile = ovl_real_file(file);
@@ -640,9 +624,8 @@ static int ovl_flush(struct file *file, fl_owner_t id)
return PTR_ERR(realfile);
if (realfile->f_op->flush) {
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- err = realfile->f_op->flush(realfile, id);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(file_inode(file)->i_sb)
+ err = realfile->f_op->flush(realfile, id);
}
return err;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index e11f310ce092..bdbf86b56a9b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -25,7 +25,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
bool full_copy_up = false;
struct dentry *upperdentry;
- const struct cred *old_cred;
err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (err)
@@ -78,9 +77,8 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out_put_write;
inode_lock(upperdentry->d_inode);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_do_notify_change(ofs, upperdentry, attr);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_do_notify_change(ofs, upperdentry, attr);
if (!err)
ovl_copyattr(dentry->d_inode);
inode_unlock(upperdentry->d_inode);
@@ -153,13 +151,22 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
}
}
+static inline int ovl_real_getattr_nosec(struct super_block *sb,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int flags)
+{
+ with_ovl_creds(sb)
+ return vfs_getattr_nosec(path, stat, request_mask, flags);
+}
+
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
+ struct super_block *sb = dentry->d_sb;
enum ovl_path_type type;
struct path realpath;
- const struct cred *old_cred;
struct inode *inode = d_inode(dentry);
bool is_dir = S_ISDIR(inode->i_mode);
int fsid = 0;
@@ -169,10 +176,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
metacopy_blocks = ovl_is_metacopy_dentry(dentry);
type = ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
+ err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags);
if (err)
- goto out;
+ return err;
/* Report the effective immutable/append-only STATX flags */
generic_fill_statx_attr(inode, stat);
@@ -195,10 +201,9 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
- flags);
+ err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags);
if (err)
- goto out;
+ return err;
/*
* Lower hardlinks may be broken on copy up to different
@@ -248,10 +253,10 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = vfs_getattr_nosec(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
- goto out;
+ return err;
} else {
lowerdatastat.blocks =
round_up(stat->size, stat->blksize) >> 9;
@@ -279,9 +284,6 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
stat->nlink = dentry->d_inode->i_nlink;
-out:
- ovl_revert_creds(old_cred);
-
return err;
}
@@ -291,7 +293,6 @@ int ovl_permission(struct mnt_idmap *idmap,
struct inode *upperinode = ovl_inode_upper(inode);
struct inode *realinode;
struct path realpath;
- const struct cred *old_cred;
int err;
/* Careful in RCU walk mode */
@@ -309,33 +310,26 @@ int ovl_permission(struct mnt_idmap *idmap,
if (err)
return err;
- old_cred = ovl_override_creds(inode->i_sb);
if (!upperinode &&
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
mask &= ~(MAY_WRITE | MAY_APPEND);
/* Make sure mounter can read file for copy up later */
mask |= MAY_READ;
}
- err = inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
- ovl_revert_creds(old_cred);
- return err;
+ with_ovl_creds(inode->i_sb)
+ return inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
}
static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- const struct cred *old_cred;
- const char *p;
-
if (!dentry)
return ERR_PTR(-ECHILD);
- old_cred = ovl_override_creds(dentry->d_sb);
- p = vfs_get_link(ovl_dentry_real(dentry), done);
- ovl_revert_creds(old_cred);
- return p;
+ with_ovl_creds(dentry->d_sb)
+ return vfs_get_link(ovl_dentry_real(dentry), done);
}
#ifdef CONFIG_FS_POSIX_ACL
@@ -465,11 +459,8 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
acl = get_cached_acl_rcu(realinode, type);
} else {
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(inode->i_sb);
- acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(inode->i_sb)
+ acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
}
return acl;
@@ -481,7 +472,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
int err;
struct path realpath;
const char *acl_name;
- const struct cred *old_cred;
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
@@ -495,10 +485,8 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
struct posix_acl *real_acl;
ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry,
- acl_name);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name);
if (IS_ERR(real_acl)) {
err = PTR_ERR(real_acl);
goto out;
@@ -518,12 +506,12 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (acl)
- err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
- else
- err = ovl_do_remove_acl(ofs, realdentry, acl_name);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb) {
+ if (acl)
+ err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
+ else
+ err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+ }
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -588,9 +576,7 @@ int ovl_update_time(struct inode *inode, int flags)
static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- int err;
struct inode *realinode = ovl_inode_realdata(inode);
- const struct cred *old_cred;
if (!realinode)
return -EIO;
@@ -598,11 +584,8 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (!realinode->i_op->fiemap)
return -EOPNOTSUPP;
- old_cred = ovl_override_creds(inode->i_sb);
- err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
- ovl_revert_creds(old_cred);
-
- return err;
+ with_ovl_creds(inode->i_sb)
+ return realinode->i_op->fiemap(realinode, fieinfo, start, len);
}
/*
@@ -653,7 +636,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(dentry);
struct path upperpath;
- const struct cred *old_cred;
unsigned int flags;
int err;
@@ -665,18 +647,18 @@ int ovl_fileattr_set(struct mnt_idmap *idmap,
if (err)
goto out;
- old_cred = ovl_override_creds(inode->i_sb);
- /*
- * Store immutable/append-only flags in xattr and clear them
- * in upper fileattr (in case they were set by older kernel)
- * so children of "ovl-immutable" directories lower aliases of
- * "ovl-immutable" hardlinks could be copied up.
- * Clear xattr when flags are cleared.
- */
- err = ovl_set_protattr(inode, upperpath.dentry, fa);
- if (!err)
- err = ovl_real_fileattr_set(&upperpath, fa);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(inode->i_sb) {
+ /*
+ * Store immutable/append-only flags in xattr and clear them
+ * in upper fileattr (in case they were set by older kernel)
+ * so children of "ovl-immutable" directories lower aliases of
+ * "ovl-immutable" hardlinks could be copied up.
+ * Clear xattr when flags are cleared.
+ */
+ err = ovl_set_protattr(inode, upperpath.dentry, fa);
+ if (!err)
+ err = ovl_real_fileattr_set(&upperpath, fa);
+ }
ovl_drop_write(dentry);
/*
@@ -730,15 +712,13 @@ int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct path realpath;
- const struct cred *old_cred;
int err;
ovl_path_real(dentry, &realpath);
- old_cred = ovl_override_creds(inode->i_sb);
- err = ovl_real_fileattr_get(&realpath, fa);
+ with_ovl_creds(inode->i_sb)
+ err = ovl_real_fileattr_get(&realpath, fa);
ovl_fileattr_prot_flags(inode, fa);
- ovl_revert_creds(old_cred);
return err;
}
@@ -1152,7 +1132,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
if (!trap)
return ERR_PTR(-ENOMEM);
- if (!(trap->i_state & I_NEW)) {
+ if (!(inode_state_read_once(trap) & I_NEW)) {
/* Conflicting layer roots? */
iput(trap);
return ERR_PTR(-ELOOP);
@@ -1243,7 +1223,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
inode = ovl_iget5(sb, oip->newinode, key);
if (!inode)
goto out_err;
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
/*
* Verify that the underlying files stored in the inode
* match those in the dentry.
@@ -1303,7 +1283,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
if (upperdentry)
ovl_check_protattr(inode, upperdentry);
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
unlock_new_inode(inode);
out:
return inode;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index e93bcc5727bc..e9a69c95be91 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -979,15 +979,10 @@ static int ovl_maybe_validate_verity(struct dentry *dentry)
return err;
if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) {
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
-
- err = ovl_validate_verity(ofs, &metapath, &datapath);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_validate_verity(ofs, &metapath, &datapath);
if (err == 0)
ovl_set_flag(OVL_VERIFIED_DIGEST, inode);
-
- ovl_revert_creds(old_cred);
}
ovl_inode_unlock(inode);
@@ -1001,7 +996,6 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
const char *redirect = ovl_lowerdata_redirect(inode);
struct ovl_path datapath = {};
- const struct cred *old_cred;
int err;
if (!redirect || ovl_dentry_lowerdata(dentry))
@@ -1019,9 +1013,8 @@ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
if (ovl_dentry_lowerdata(dentry))
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_lookup_data_layers(dentry, redirect, &datapath);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_lookup_data_layers(dentry, redirect, &datapath);
if (err)
goto out_err;
@@ -1077,57 +1070,44 @@ static bool ovl_check_follow_redirect(struct ovl_lookup_data *d)
return true;
}
-struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
+struct ovl_lookup_ctx {
+ struct dentry *dentry;
+ struct ovl_entry *oe;
+ struct ovl_path *stack;
+ struct ovl_path *origin_path;
+ struct dentry *upperdentry;
+ struct dentry *index;
+ struct inode *inode;
+ unsigned int ctr;
+};
+
+static int ovl_lookup_layers(struct ovl_lookup_ctx *ctx, struct ovl_lookup_data *d)
{
- struct ovl_entry *oe = NULL;
- const struct cred *old_cred;
+ struct dentry *dentry = ctx->dentry;
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_entry *poe = OVL_E(dentry->d_parent);
struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root);
- struct ovl_path *stack = NULL, *origin_path = NULL;
- struct dentry *upperdir, *upperdentry = NULL;
- struct dentry *origin = NULL;
- struct dentry *index = NULL;
- unsigned int ctr = 0;
- struct inode *inode = NULL;
- bool upperopaque = false;
bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer);
+ struct dentry *upperdir;
struct dentry *this;
- unsigned int i;
- int err;
+ struct dentry *origin = NULL;
+ bool upperopaque = false;
bool uppermetacopy = false;
int metacopy_size = 0;
- struct ovl_lookup_data d = {
- .sb = dentry->d_sb,
- .dentry = dentry,
- .name = dentry->d_name,
- .is_dir = false,
- .opaque = false,
- .stop = false,
- .last = check_redirect ? false : !ovl_numlower(poe),
- .redirect = NULL,
- .upperredirect = NULL,
- .metacopy = 0,
- };
-
- if (dentry->d_name.len > ofs->namelen)
- return ERR_PTR(-ENAMETOOLONG);
+ unsigned int i;
+ int err;
- old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- d.layer = &ofs->layers[0];
- err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
+ d->layer = &ofs->layers[0];
+ err = ovl_lookup_layer(upperdir, d, &ctx->upperdentry, true);
if (err)
- goto out;
+ return err;
- if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) {
- dput(upperdentry);
- err = -EREMOTE;
- goto out;
- }
- if (upperdentry && !d.is_dir) {
+ if (ctx->upperdentry && ctx->upperdentry->d_flags & DCACHE_OP_REAL)
+ return -EREMOTE;
+
+ if (ctx->upperdentry && !d->is_dir) {
/*
* Lookup copy up origin by decoding origin file handle.
* We may get a disconnected dentry, which is fine,
@@ -1138,50 +1118,50 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* number - it's the same as if we held a reference
* to a dentry in lower layer that was moved under us.
*/
- err = ovl_check_origin(ofs, upperdentry, &origin_path);
+ err = ovl_check_origin(ofs, ctx->upperdentry, &ctx->origin_path);
if (err)
- goto out_put_upper;
+ return err;
- if (d.metacopy)
+ if (d->metacopy)
uppermetacopy = true;
- metacopy_size = d.metacopy;
+ metacopy_size = d->metacopy;
}
- if (d.redirect) {
+ if (d->redirect) {
err = -ENOMEM;
- d.upperredirect = kstrdup(d.redirect, GFP_KERNEL);
- if (!d.upperredirect)
- goto out_put_upper;
- if (d.redirect[0] == '/')
+ d->upperredirect = kstrdup(d->redirect, GFP_KERNEL);
+ if (!d->upperredirect)
+ return err;
+ if (d->redirect[0] == '/')
poe = roe;
}
- upperopaque = d.opaque;
+ upperopaque = d->opaque;
}
- if (!d.stop && ovl_numlower(poe)) {
+ if (!d->stop && ovl_numlower(poe)) {
err = -ENOMEM;
- stack = ovl_stack_alloc(ofs->numlayer - 1);
- if (!stack)
- goto out_put_upper;
+ ctx->stack = ovl_stack_alloc(ofs->numlayer - 1);
+ if (!ctx->stack)
+ return err;
}
- for (i = 0; !d.stop && i < ovl_numlower(poe); i++) {
+ for (i = 0; !d->stop && i < ovl_numlower(poe); i++) {
struct ovl_path lower = ovl_lowerstack(poe)[i];
- if (!ovl_check_follow_redirect(&d)) {
+ if (!ovl_check_follow_redirect(d)) {
err = -EPERM;
- goto out_put;
+ return err;
}
if (!check_redirect)
- d.last = i == ovl_numlower(poe) - 1;
- else if (d.is_dir || !ofs->numdatalayer)
- d.last = lower.layer->idx == ovl_numlower(roe);
+ d->last = i == ovl_numlower(poe) - 1;
+ else if (d->is_dir || !ofs->numdatalayer)
+ d->last = lower.layer->idx == ovl_numlower(roe);
- d.layer = lower.layer;
- err = ovl_lookup_layer(lower.dentry, &d, &this, false);
+ d->layer = lower.layer;
+ err = ovl_lookup_layer(lower.dentry, d, &this, false);
if (err)
- goto out_put;
+ return err;
if (!this)
continue;
@@ -1190,11 +1170,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* If no origin fh is stored in upper of a merge dir, store fh
* of lower dir and set upper parent "impure".
*/
- if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) {
- err = ovl_fix_origin(ofs, dentry, this, upperdentry);
+ if (ctx->upperdentry && !ctx->ctr && !ofs->noxattr && d->is_dir) {
+ err = ovl_fix_origin(ofs, dentry, this, ctx->upperdentry);
if (err) {
dput(this);
- goto out_put;
+ return err;
}
}
@@ -1207,23 +1187,23 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* matches the dentry found using path based lookup,
* otherwise error out.
*/
- if (upperdentry && !ctr &&
- ((d.is_dir && ovl_verify_lower(dentry->d_sb)) ||
- (!d.is_dir && ofs->config.index && origin_path))) {
- err = ovl_verify_origin(ofs, upperdentry, this, false);
+ if (ctx->upperdentry && !ctx->ctr &&
+ ((d->is_dir && ovl_verify_lower(dentry->d_sb)) ||
+ (!d->is_dir && ofs->config.index && ctx->origin_path))) {
+ err = ovl_verify_origin(ofs, ctx->upperdentry, this, false);
if (err) {
dput(this);
- if (d.is_dir)
+ if (d->is_dir)
break;
- goto out_put;
+ return err;
}
origin = this;
}
- if (!upperdentry && !d.is_dir && !ctr && d.metacopy)
- metacopy_size = d.metacopy;
+ if (!ctx->upperdentry && !d->is_dir && !ctx->ctr && d->metacopy)
+ metacopy_size = d->metacopy;
- if (d.metacopy && ctr) {
+ if (d->metacopy && ctx->ctr) {
/*
* Do not store intermediate metacopy dentries in
* lower chain, except top most lower metacopy dentry.
@@ -1233,15 +1213,15 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
dput(this);
this = NULL;
} else {
- stack[ctr].dentry = this;
- stack[ctr].layer = lower.layer;
- ctr++;
+ ctx->stack[ctx->ctr].dentry = this;
+ ctx->stack[ctx->ctr].layer = lower.layer;
+ ctx->ctr++;
}
- if (d.stop)
+ if (d->stop)
break;
- if (d.redirect && d.redirect[0] == '/' && poe != roe) {
+ if (d->redirect && d->redirect[0] == '/' && poe != roe) {
poe = roe;
/* Find the current layer on the root dentry */
i = lower.layer->idx - 1;
@@ -1252,12 +1232,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* Defer lookup of lowerdata in data-only layers to first access.
* Don't require redirect=follow and metacopy=on in this case.
*/
- if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) {
- d.metacopy = 0;
- ctr++;
- } else if (!ovl_check_follow_redirect(&d)) {
+ if (d->metacopy && ctx->ctr && ofs->numdatalayer && d->absolute_redirect) {
+ d->metacopy = 0;
+ ctx->ctr++;
+ } else if (!ovl_check_follow_redirect(d)) {
err = -EPERM;
- goto out_put;
+ return err;
}
/*
@@ -1268,20 +1248,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* For metacopy dentry, path based lookup will find lower dentries.
* Just make sure a corresponding data dentry has been found.
*/
- if (d.metacopy || (uppermetacopy && !ctr)) {
+ if (d->metacopy || (uppermetacopy && !ctx->ctr)) {
pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n",
dentry);
err = -EIO;
- goto out_put;
- } else if (!d.is_dir && upperdentry && !ctr && origin_path) {
- if (WARN_ON(stack != NULL)) {
+ return err;
+ } else if (!d->is_dir && ctx->upperdentry && !ctx->ctr && ctx->origin_path) {
+ if (WARN_ON(ctx->stack != NULL)) {
err = -EIO;
- goto out_put;
+ return err;
}
- stack = origin_path;
- ctr = 1;
- origin = origin_path->dentry;
- origin_path = NULL;
+ ctx->stack = ctx->origin_path;
+ ctx->ctr = 1;
+ origin = ctx->origin_path->dentry;
+ ctx->origin_path = NULL;
}
/*
@@ -1303,38 +1283,39 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* is enabled and if upper had an ORIGIN xattr.
*
*/
- if (!upperdentry && ctr)
- origin = stack[0].dentry;
+ if (!ctx->upperdentry && ctx->ctr)
+ origin = ctx->stack[0].dentry;
if (origin && ovl_indexdir(dentry->d_sb) &&
- (!d.is_dir || ovl_index_all(dentry->d_sb))) {
- index = ovl_lookup_index(ofs, upperdentry, origin, true);
- if (IS_ERR(index)) {
- err = PTR_ERR(index);
- index = NULL;
- goto out_put;
+ (!d->is_dir || ovl_index_all(dentry->d_sb))) {
+ ctx->index = ovl_lookup_index(ofs, ctx->upperdentry, origin, true);
+ if (IS_ERR(ctx->index)) {
+ err = PTR_ERR(ctx->index);
+ ctx->index = NULL;
+ return err;
}
}
- if (ctr) {
- oe = ovl_alloc_entry(ctr);
+ if (ctx->ctr) {
+ ctx->oe = ovl_alloc_entry(ctx->ctr);
err = -ENOMEM;
- if (!oe)
- goto out_put;
+ if (!ctx->oe)
+ return err;
- ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr);
+ ovl_stack_cpy(ovl_lowerstack(ctx->oe), ctx->stack, ctx->ctr);
}
if (upperopaque)
ovl_dentry_set_opaque(dentry);
- if (d.xwhiteouts)
+ if (d->xwhiteouts)
ovl_dentry_set_xwhiteouts(dentry);
- if (upperdentry)
+ if (ctx->upperdentry)
ovl_dentry_set_upper_alias(dentry);
- else if (index) {
+ else if (ctx->index) {
+ char *upperredirect;
struct path upperpath = {
- .dentry = upperdentry = dget(index),
+ .dentry = ctx->upperdentry = dget(ctx->index),
.mnt = ovl_upper_mnt(ofs),
};
@@ -1343,84 +1324,100 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
* assignment happens only if upperdentry is non-NULL, and
* this one only if upperdentry is NULL.
*/
- d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
- if (IS_ERR(d.upperredirect)) {
- err = PTR_ERR(d.upperredirect);
- d.upperredirect = NULL;
- goto out_free_oe;
- }
+ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
+ if (IS_ERR(upperredirect))
+ return PTR_ERR(upperredirect);
+ d->upperredirect = upperredirect;
err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL);
if (err < 0)
- goto out_free_oe;
- d.metacopy = uppermetacopy = err;
+ return err;
+ d->metacopy = uppermetacopy = err;
metacopy_size = err;
- if (!ovl_check_follow_redirect(&d)) {
+ if (!ovl_check_follow_redirect(d)) {
err = -EPERM;
- goto out_free_oe;
+ return err;
}
}
- if (upperdentry || ctr) {
+ if (ctx->upperdentry || ctx->ctr) {
+ struct inode *inode;
struct ovl_inode_params oip = {
- .upperdentry = upperdentry,
- .oe = oe,
- .index = index,
- .redirect = d.upperredirect,
+ .upperdentry = ctx->upperdentry,
+ .oe = ctx->oe,
+ .index = ctx->index,
+ .redirect = d->upperredirect,
};
/* Store lowerdata redirect for lazy lookup */
- if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) {
- oip.lowerdata_redirect = d.redirect;
- d.redirect = NULL;
+ if (ctx->ctr > 1 && !d->is_dir && !ctx->stack[ctx->ctr - 1].dentry) {
+ oip.lowerdata_redirect = d->redirect;
+ d->redirect = NULL;
}
+
inode = ovl_get_inode(dentry->d_sb, &oip);
- err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_free_oe;
- if (upperdentry && !uppermetacopy)
- ovl_set_flag(OVL_UPPERDATA, inode);
+ return PTR_ERR(inode);
+
+ ctx->inode = inode;
+ if (ctx->upperdentry && !uppermetacopy)
+ ovl_set_flag(OVL_UPPERDATA, ctx->inode);
if (metacopy_size > OVL_METACOPY_MIN_SIZE)
- ovl_set_flag(OVL_HAS_DIGEST, inode);
+ ovl_set_flag(OVL_HAS_DIGEST, ctx->inode);
}
- ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode));
+ ovl_dentry_init_reval(dentry, ctx->upperdentry, OVL_I_E(ctx->inode));
+
+ return 0;
+}
+
+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct ovl_entry *poe = OVL_E(dentry->d_parent);
+ bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer);
+ int err;
+ struct ovl_lookup_ctx ctx = {
+ .dentry = dentry,
+ };
+ struct ovl_lookup_data d = {
+ .sb = dentry->d_sb,
+ .dentry = dentry,
+ .name = dentry->d_name,
+ .last = check_redirect ? false : !ovl_numlower(poe),
+ };
+
+ if (dentry->d_name.len > ofs->namelen)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_lookup_layers(&ctx, &d);
- ovl_revert_creds(old_cred);
- if (origin_path) {
- dput(origin_path->dentry);
- kfree(origin_path);
+ if (ctx.origin_path) {
+ dput(ctx.origin_path->dentry);
+ kfree(ctx.origin_path);
}
- dput(index);
- ovl_stack_free(stack, ctr);
+ dput(ctx.index);
+ ovl_stack_free(ctx.stack, ctx.ctr);
kfree(d.redirect);
- return d_splice_alias(inode, dentry);
-out_free_oe:
- ovl_free_entry(oe);
-out_put:
- dput(index);
- ovl_stack_free(stack, ctr);
-out_put_upper:
- if (origin_path) {
- dput(origin_path->dentry);
- kfree(origin_path);
+ if (err) {
+ ovl_free_entry(ctx.oe);
+ dput(ctx.upperdentry);
+ kfree(d.upperredirect);
+ return ERR_PTR(err);
}
- dput(upperdentry);
- kfree(d.upperredirect);
-out:
- kfree(d.redirect);
- ovl_revert_creds(old_cred);
- return ERR_PTR(err);
+
+ return d_splice_alias(ctx.inode, dentry);
}
bool ovl_lower_positive(struct dentry *dentry)
{
struct ovl_entry *poe = OVL_E(dentry->d_parent);
const struct qstr *name = &dentry->d_name;
- const struct cred *old_cred;
unsigned int i;
bool positive = false;
bool done = false;
@@ -1436,46 +1433,45 @@ bool ovl_lower_positive(struct dentry *dentry)
if (!ovl_dentry_upper(dentry))
return true;
- old_cred = ovl_override_creds(dentry->d_sb);
- /* Positive upper -> have to look up lower to see whether it exists */
- for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) {
- struct dentry *this;
- struct ovl_path *parentpath = &ovl_lowerstack(poe)[i];
+ with_ovl_creds(dentry->d_sb) {
+ /* Positive upper -> have to look up lower to see whether it exists */
+ for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) {
+ struct dentry *this;
+ struct ovl_path *parentpath = &ovl_lowerstack(poe)[i];
- /*
- * We need to make a non-const copy of dentry->d_name,
- * because lookup_one_positive_unlocked() will hash name
- * with parentpath base, which is on another (lower fs).
- */
- this = lookup_one_positive_unlocked(
- mnt_idmap(parentpath->layer->mnt),
- &QSTR_LEN(name->name, name->len),
- parentpath->dentry);
- if (IS_ERR(this)) {
- switch (PTR_ERR(this)) {
- case -ENOENT:
- case -ENAMETOOLONG:
- break;
-
- default:
- /*
- * Assume something is there, we just couldn't
- * access it.
- */
- positive = true;
- break;
+ /*
+ * We need to make a non-const copy of dentry->d_name,
+ * because lookup_one_positive_unlocked() will hash name
+ * with parentpath base, which is on another (lower fs).
+ */
+ this = lookup_one_positive_unlocked(mnt_idmap(parentpath->layer->mnt),
+ &QSTR_LEN(name->name, name->len),
+ parentpath->dentry);
+ if (IS_ERR(this)) {
+ switch (PTR_ERR(this)) {
+ case -ENOENT:
+ case -ENAMETOOLONG:
+ break;
+
+ default:
+ /*
+ * Assume something is there, we just couldn't
+ * access it.
+ */
+ positive = true;
+ break;
+ }
+ } else {
+ struct path path = {
+ .dentry = this,
+ .mnt = parentpath->layer->mnt,
+ };
+ positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
+ done = true;
+ dput(this);
}
- } else {
- struct path path = {
- .dentry = this,
- .mnt = parentpath->layer->mnt,
- };
- positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path);
- done = true;
- dput(this);
}
}
- ovl_revert_creds(old_cred);
return positive;
}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index c8fd5951fc5e..f9ac9bdde830 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -206,7 +206,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
static inline int ovl_do_rmdir(struct ovl_fs *ofs,
struct inode *dir, struct dentry *dentry)
{
- int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry);
+ int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
@@ -235,7 +235,7 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
struct inode *dir, struct dentry *dentry,
umode_t mode)
{
- int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true);
+ int err = vfs_create(ovl_upper_mnt_idmap(ofs), dentry, mode, NULL);
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
@@ -248,7 +248,7 @@ static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
{
struct dentry *ret;
- ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+ ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, NULL);
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret));
return ret;
}
@@ -257,7 +257,7 @@ static inline int ovl_do_mknod(struct ovl_fs *ofs,
struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev)
{
- int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev);
+ int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev, NULL);
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
return err;
@@ -267,7 +267,7 @@ static inline int ovl_do_symlink(struct ovl_fs *ofs,
struct inode *dir, struct dentry *dentry,
const char *oldname)
{
- int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname);
+ int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname, NULL);
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
@@ -355,11 +355,24 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);
}
+static inline int ovl_do_rename_rd(struct renamedata *rd)
+{
+ int err;
+
+ pr_debug("rename(%pd2, %pd2, 0x%x)\n", rd->old_dentry, rd->new_dentry,
+ rd->flags);
+ err = vfs_rename(rd);
+ if (err) {
+ pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
+ rd->old_dentry, rd->new_dentry, err);
+ }
+ return err;
+}
+
static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
struct dentry *olddentry, struct dentry *newdir,
struct dentry *newdentry, unsigned int flags)
{
- int err;
struct renamedata rd = {
.mnt_idmap = ovl_upper_mnt_idmap(ofs),
.old_parent = olddir,
@@ -369,13 +382,7 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
.flags = flags,
};
- pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
- err = vfs_rename(&rd);
- if (err) {
- pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
- olddentry, newdentry, err);
- }
- return err;
+ return ovl_do_rename_rd(&rd);
}
static inline int ovl_do_whiteout(struct ovl_fs *ofs,
@@ -415,6 +422,22 @@ static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs,
&QSTR_LEN(name, len), base);
}
+static inline struct dentry *ovl_start_creating_upper(struct ovl_fs *ofs,
+ struct dentry *parent,
+ struct qstr *name)
+{
+ return start_creating(ovl_upper_mnt_idmap(ofs),
+ parent, name);
+}
+
+static inline struct dentry *ovl_start_removing_upper(struct ovl_fs *ofs,
+ struct dentry *parent,
+ struct qstr *name)
+{
+ return start_removing(ovl_upper_mnt_idmap(ofs),
+ parent, name);
+}
+
static inline bool ovl_open_flags_need_copy_up(int flags)
{
if (!flags)
@@ -424,11 +447,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
}
/* util.c */
-int ovl_parent_lock(struct dentry *parent, struct dentry *child);
-static inline void ovl_parent_unlock(struct dentry *parent)
-{
- inode_unlock(parent->d_inode);
-}
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
void ovl_start_write(struct dentry *dentry);
@@ -437,7 +455,11 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
-void ovl_revert_creds(const struct cred *old_cred);
+
+EXTEND_CLASS(override_creds, _ovl, ovl_override_creds(sb), struct super_block *sb)
+
+#define with_ovl_creds(sb) \
+ scoped_class(override_creds_ovl, __UNIQUE_ID(label), sb)
static inline const struct cred *ovl_creds(struct super_block *sb)
{
@@ -865,7 +887,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs,
struct dentry *parent, struct dentry *newdentry,
struct ovl_cattr *attr);
int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry);
-struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
+#define OVL_TEMPNAME_SIZE 20
+void ovl_tempname(char name[OVL_TEMPNAME_SIZE]);
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 1e9792cc557b..160960bb0ad0 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -348,11 +348,7 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
- int err = 0;
struct dentry *dentry, *dir = path->dentry;
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(rdd->dentry->d_sb);
while (rdd->first_maybe_whiteout) {
struct ovl_cache_entry *p =
@@ -365,13 +361,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
} else if (PTR_ERR(dentry) == -EINTR) {
- err = -EINTR;
- break;
+ return -EINTR;
}
}
- ovl_revert_creds(old_cred);
- return err;
+ return 0;
}
static inline int ovl_dir_read(const struct path *realpath,
@@ -838,36 +832,12 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
return err;
}
-
-static int ovl_iterate(struct file *file, struct dir_context *ctx)
+static int ovl_iterate_merged(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
- struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_cache_entry *p;
- const struct cred *old_cred;
- int err;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- if (!ctx->pos)
- ovl_dir_reset(file);
-
- if (od->is_real) {
- /*
- * If parent is merge, then need to adjust d_ino for '..', if
- * dir is impure then need to adjust d_ino for copied up
- * entries.
- */
- if (ovl_xino_bits(ofs) ||
- (ovl_same_fs(ofs) &&
- (ovl_is_impure_dir(file) ||
- OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) {
- err = ovl_iterate_real(file, ctx);
- } else {
- err = iterate_dir(od->realfile, ctx);
- }
- goto out;
- }
+ int err = 0;
if (!od->cache) {
struct ovl_dir_cache *cache;
@@ -875,7 +845,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
cache = ovl_cache_get(dentry);
err = PTR_ERR(cache);
if (IS_ERR(cache))
- goto out;
+ return err;
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
@@ -887,7 +857,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
if (!p->ino || p->check_xwhiteout) {
err = ovl_cache_update(&file->f_path, p, !p->ino);
if (err)
- goto out;
+ return err;
}
}
/* ovl_cache_update() sets is_whiteout on stale entry */
@@ -898,12 +868,50 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
od->cursor = p->l_node.next;
ctx->pos++;
}
- err = 0;
-out:
- ovl_revert_creds(old_cred);
return err;
}
+static bool ovl_need_adjust_d_ino(struct file *file)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+
+ /* If parent is merge, then need to adjust d_ino for '..' */
+ if (ovl_xino_bits(ofs))
+ return true;
+
+ /* Can't do consistent inode numbering */
+ if (!ovl_same_fs(ofs))
+ return false;
+
+ /* If dir is impure then need to adjust d_ino for copied up entries */
+ if (ovl_is_impure_dir(file) ||
+ OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))
+ return true;
+
+ /* Pure: no need to adjust d_ino */
+ return false;
+}
+
+
+static int ovl_iterate(struct file *file, struct dir_context *ctx)
+{
+ struct ovl_dir_file *od = file->private_data;
+
+ if (!ctx->pos)
+ ovl_dir_reset(file);
+
+ with_ovl_creds(file_dentry(file)->d_sb) {
+ if (!od->is_real)
+ return ovl_iterate_merged(file, ctx);
+
+ if (ovl_need_adjust_d_ino(file))
+ return ovl_iterate_real(file, ctx);
+
+ return iterate_dir(od->realfile, ctx);
+ }
+}
+
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
@@ -947,14 +955,8 @@ out_unlock:
static struct file *ovl_dir_open_realfile(const struct file *file,
const struct path *realpath)
{
- struct file *res;
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
- ovl_revert_creds(old_cred);
-
- return res;
+ with_ovl_creds(file_inode(file)->i_sb)
+ return ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE));
}
/*
@@ -1075,11 +1077,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
int err;
struct ovl_cache_entry *p, *n;
struct rb_root root = RB_ROOT;
- const struct cred *old_cred;
- old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_dir_read_merged(dentry, list, &root);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_dir_read_merged(dentry, list, &root);
if (err)
return err;
@@ -1242,11 +1242,11 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
if (!d_is_dir(dentry) || level > 1)
return ovl_cleanup(ofs, parent, dentry);
- err = ovl_parent_lock(parent, dentry);
- if (err)
- return err;
+ dentry = start_removing_dentry(parent, dentry);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
err = ovl_do_rmdir(ofs, parent->d_inode, dentry);
- ovl_parent_unlock(parent);
+ end_removing(dentry);
if (err) {
struct path path = { .mnt = mnt, .dentry = dentry };
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 43ee4c7296a7..28b2f707cfbc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -310,8 +310,7 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
bool retried = false;
retry:
- inode_lock_nested(dir, I_MUTEX_PARENT);
- work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
+ work = ovl_start_creating_upper(ofs, ofs->workbasedir, &QSTR(name));
if (!IS_ERR(work)) {
struct iattr attr = {
@@ -320,14 +319,12 @@ retry:
};
if (work->d_inode) {
+ end_creating_keep(work);
+ if (persist)
+ return work;
err = -EEXIST;
- inode_unlock(dir);
if (retried)
goto out_dput;
-
- if (persist)
- return work;
-
retried = true;
err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0);
dput(work);
@@ -338,7 +335,7 @@ retry:
}
work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
- inode_unlock(dir);
+ end_creating_keep(work);
err = PTR_ERR(work);
if (IS_ERR(work))
goto out_err;
@@ -376,7 +373,6 @@ retry:
if (err)
goto out_dput;
} else {
- inode_unlock(dir);
err = PTR_ERR(work);
goto out_err;
}
@@ -567,9 +563,10 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
{
struct dentry *workdir = ofs->workdir;
struct dentry *temp;
- struct dentry *dest;
struct dentry *whiteout;
struct name_snapshot name;
+ struct renamedata rd = {};
+ char name2[OVL_TEMPNAME_SIZE];
int err;
temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
@@ -577,23 +574,21 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
if (IS_ERR(temp))
return err;
- err = ovl_parent_lock(workdir, temp);
+ rd.mnt_idmap = ovl_upper_mnt_idmap(ofs);
+ rd.old_parent = workdir;
+ rd.new_parent = workdir;
+ rd.flags = RENAME_WHITEOUT;
+ ovl_tempname(name2);
+ err = start_renaming_dentry(&rd, 0, temp, &QSTR(name2));
if (err) {
dput(temp);
return err;
}
- dest = ovl_lookup_temp(ofs, workdir);
- err = PTR_ERR(dest);
- if (IS_ERR(dest)) {
- dput(temp);
- ovl_parent_unlock(workdir);
- return err;
- }
/* Name is inline and stable - using snapshot as a copy helper */
take_dentry_name_snapshot(&name, temp);
- err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT);
- ovl_parent_unlock(workdir);
+ err = ovl_do_rename_rd(&rd);
+ end_renaming(&rd);
if (err) {
if (err == -EINVAL)
err = 0;
@@ -617,7 +612,6 @@ cleanup_temp:
ovl_cleanup(ofs, workdir, temp);
release_dentry_name_snapshot(&name);
dput(temp);
- dput(dest);
return err;
}
@@ -626,14 +620,15 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
struct dentry *parent,
const char *name, umode_t mode)
{
- size_t len = strlen(name);
struct dentry *child;
- inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- child = ovl_lookup_upper(ofs, name, parent, len);
- if (!IS_ERR(child) && !child->d_inode)
- child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode));
- inode_unlock(parent->d_inode);
+ child = ovl_start_creating_upper(ofs, parent, &QSTR(name));
+ if (!IS_ERR(child)) {
+ if (!child->d_inode)
+ child = ovl_create_real(ofs, parent, child,
+ OVL_CATTR(mode));
+ end_creating_keep(child);
+ }
dput(parent);
return child;
@@ -1369,53 +1364,35 @@ static void ovl_set_d_op(struct super_block *sb)
set_default_d_op(sb, &ovl_dentry_operations);
}
-int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
+static int ovl_fill_super_creds(struct fs_context *fc, struct super_block *sb)
{
struct ovl_fs *ofs = sb->s_fs_info;
+ struct cred *creator_cred = (struct cred *)ofs->creator_cred;
struct ovl_fs_context *ctx = fc->fs_private;
- const struct cred *old_cred = NULL;
- struct dentry *root_dentry;
- struct ovl_entry *oe;
struct ovl_layer *layers;
- struct cred *cred;
+ struct ovl_entry *oe = NULL;
int err;
- err = -EIO;
- if (WARN_ON(fc->user_ns != current_user_ns()))
- goto out_err;
-
- ovl_set_d_op(sb);
-
- err = -ENOMEM;
- if (!ofs->creator_cred)
- ofs->creator_cred = cred = prepare_creds();
- else
- cred = (struct cred *)ofs->creator_cred;
- if (!cred)
- goto out_err;
-
- old_cred = ovl_override_creds(sb);
-
err = ovl_fs_params_verify(ctx, &ofs->config);
if (err)
- goto out_err;
+ return err;
err = -EINVAL;
if (ctx->nr == 0) {
if (!(fc->sb_flags & SB_SILENT))
pr_err("missing 'lowerdir'\n");
- goto out_err;
+ return err;
}
err = -ENOMEM;
layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL);
if (!layers)
- goto out_err;
+ return err;
ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL);
if (!ofs->config.lowerdirs) {
kfree(layers);
- goto out_err;
+ return err;
}
ofs->layers = layers;
/*
@@ -1448,12 +1425,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
err = -EINVAL;
if (!ofs->config.workdir) {
pr_err("missing 'workdir'\n");
- goto out_err;
+ return err;
}
err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper);
if (err)
- goto out_err;
+ return err;
upper_sb = ovl_upper_mnt(ofs)->mnt_sb;
if (!ovl_should_sync(ofs)) {
@@ -1461,13 +1438,13 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) {
err = -EIO;
pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n");
- goto out_err;
+ return err;
}
}
err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work);
if (err)
- goto out_err;
+ return err;
if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
@@ -1478,7 +1455,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
oe = ovl_get_lowerstack(sb, ctx, ofs, layers);
err = PTR_ERR(oe);
if (IS_ERR(oe))
- goto out_err;
+ return err;
/* If the upper fs is nonexistent, we mark overlayfs r/o too */
if (!ovl_upper_mnt(ofs))
@@ -1531,7 +1508,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ovl_export_fid_operations;
/* Never override disk quota limits or use reserved space */
- cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
+ cap_lower(creator_cred->cap_effective, CAP_SYS_RESOURCE);
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_xattr = ovl_xattr_handlers(ofs);
@@ -1549,27 +1526,44 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED;
err = -ENOMEM;
- root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
- if (!root_dentry)
+ sb->s_root = ovl_get_root(sb, ctx->upper.dentry, oe);
+ if (!sb->s_root)
goto out_free_oe;
- sb->s_root = root_dentry;
-
- ovl_revert_creds(old_cred);
return 0;
out_free_oe:
ovl_free_entry(oe);
+ return err;
+}
+
+int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+ int err;
+
+ err = -EIO;
+ if (WARN_ON(fc->user_ns != current_user_ns()))
+ goto out_err;
+
+ ovl_set_d_op(sb);
+
+ if (!ofs->creator_cred) {
+ err = -ENOMEM;
+ ofs->creator_cred = prepare_creds();
+ if (!ofs->creator_cred)
+ goto out_err;
+ }
+
+ with_ovl_creds(sb)
+ err = ovl_fill_super_creds(fc, sb);
+
out_err:
- /*
- * Revert creds before calling ovl_free_fs() which will call
- * put_cred() and put_cred() requires that the cred's that are
- * put are not the caller's creds, i.e., current->cred.
- */
- if (old_cred)
- ovl_revert_creds(old_cred);
- ovl_free_fs(ofs);
- sb->s_fs_info = NULL;
+ if (err) {
+ ovl_free_fs(ofs);
+ sb->s_fs_info = NULL;
+ }
+
return err;
}
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f76672f2e686..94986d11a166 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -69,11 +69,6 @@ const struct cred *ovl_override_creds(struct super_block *sb)
return override_creds(ofs->creator_cred);
}
-void ovl_revert_creds(const struct cred *old_cred)
-{
- revert_creds(old_cred);
-}
-
/*
* Check if underlying fs supports file handles and try to determine encoding
* type, in order to deduce maximum inode number used by fs.
@@ -1019,8 +1014,8 @@ bool ovl_inuse_trylock(struct dentry *dentry)
bool locked = false;
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_OVL_INUSE)) {
- inode->i_state |= I_OVL_INUSE;
+ if (!(inode_state_read(inode) & I_OVL_INUSE)) {
+ inode_state_set(inode, I_OVL_INUSE);
locked = true;
}
spin_unlock(&inode->i_lock);
@@ -1034,8 +1029,8 @@ void ovl_inuse_unlock(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
spin_lock(&inode->i_lock);
- WARN_ON(!(inode->i_state & I_OVL_INUSE));
- inode->i_state &= ~I_OVL_INUSE;
+ WARN_ON(!(inode_state_read(inode) & I_OVL_INUSE));
+ inode_state_clear(inode, I_OVL_INUSE);
spin_unlock(&inode->i_lock);
}
}
@@ -1046,7 +1041,7 @@ bool ovl_is_inuse(struct dentry *dentry)
bool inuse;
spin_lock(&inode->i_lock);
- inuse = (inode->i_state & I_OVL_INUSE);
+ inuse = (inode_state_read(inode) & I_OVL_INUSE);
spin_unlock(&inode->i_lock);
return inuse;
@@ -1147,7 +1142,6 @@ fail:
int ovl_nlink_start(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- const struct cred *old_cred;
int err;
if (WARN_ON(!inode))
@@ -1184,15 +1178,14 @@ int ovl_nlink_start(struct dentry *dentry)
if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode))
return 0;
- old_cred = ovl_override_creds(dentry->d_sb);
/*
* The overlay inode nlink should be incremented/decremented IFF the
* upper operation succeeds, along with nlink change of upper inode.
* Therefore, before link/unlink/rename, we store the union nlink
* value relative to the upper inode nlink in an upper inode xattr.
*/
- err = ovl_set_nlink_upper(dentry);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ err = ovl_set_nlink_upper(dentry);
if (err)
goto out_drop_write;
@@ -1213,11 +1206,8 @@ void ovl_nlink_end(struct dentry *dentry)
ovl_drop_write(dentry);
if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) {
- const struct cred *old_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
- ovl_cleanup_index(dentry);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ ovl_cleanup_index(dentry);
}
ovl_inode_unlock(inode);
@@ -1234,9 +1224,9 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
goto err;
if (trap)
goto err_unlock;
- if (work && work->d_parent != workdir)
+ if (work && (work->d_parent != workdir || d_unhashed(work)))
goto err_unlock;
- if (upper && upper->d_parent != upperdir)
+ if (upper && (upper->d_parent != upperdir || d_unhashed(upper)))
goto err_unlock;
return 0;
@@ -1548,14 +1538,3 @@ void ovl_copyattr(struct inode *inode)
i_size_write(inode, i_size_read(realinode));
spin_unlock(&inode->i_lock);
}
-
-int ovl_parent_lock(struct dentry *parent, struct dentry *child)
-{
- inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
- if (!child ||
- (!d_unhashed(child) && child->d_parent == parent))
- return 0;
-
- inode_unlock(parent->d_inode);
- return -EINVAL;
-}
diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c
index 88055deca936..aa95855c7023 100644
--- a/fs/overlayfs/xattrs.c
+++ b/fs/overlayfs/xattrs.c
@@ -41,13 +41,11 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
struct path realpath;
- const struct cred *old_cred;
if (!value && !upperdentry) {
ovl_path_lower(dentry, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0);
if (err < 0)
goto out;
}
@@ -64,15 +62,14 @@ static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char
if (err)
goto out;
- old_cred = ovl_override_creds(dentry->d_sb);
- if (value) {
- err = ovl_do_setxattr(ofs, realdentry, name, value, size,
- flags);
- } else {
- WARN_ON(flags != XATTR_REPLACE);
- err = ovl_do_removexattr(ofs, realdentry, name);
+ with_ovl_creds(dentry->d_sb) {
+ if (value) {
+ err = ovl_do_setxattr(ofs, realdentry, name, value, size, flags);
+ } else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = ovl_do_removexattr(ofs, realdentry, name);
+ }
}
- ovl_revert_creds(old_cred);
ovl_drop_write(dentry);
/* copy c/mtime */
@@ -84,15 +81,11 @@ out:
static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size)
{
- ssize_t res;
- const struct cred *old_cred;
struct path realpath;
ovl_i_path_real(inode, &realpath);
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
- ovl_revert_creds(old_cred);
- return res;
+ with_ovl_creds(dentry->d_sb)
+ return vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size);
}
static bool ovl_can_list(struct super_block *sb, const char *s)
@@ -116,12 +109,10 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
ssize_t res;
size_t len;
char *s;
- const struct cred *old_cred;
size_t prefix_len, name_len;
- old_cred = ovl_override_creds(dentry->d_sb);
- res = vfs_listxattr(realdentry, list, size);
- ovl_revert_creds(old_cred);
+ with_ovl_creds(dentry->d_sb)
+ res = vfs_listxattr(realdentry, list, size);
if (res <= 0 || size == 0)
return res;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 0ef5b47d796a..dba703d4ce4a 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -39,20 +39,20 @@ void pidfs_get_root(struct path *path)
path_get(path);
}
-/*
- * Stashes information that userspace needs to access even after the
- * process has been reaped.
- */
-struct pidfs_exit_info {
- __u64 cgroupid;
- __s32 exit_code;
- __u32 coredump_mask;
+enum pidfs_attr_mask_bits {
+ PIDFS_ATTR_BIT_EXIT = 0,
+ PIDFS_ATTR_BIT_COREDUMP = 1,
};
struct pidfs_attr {
+ unsigned long attr_mask;
struct simple_xattrs *xattrs;
- struct pidfs_exit_info __pei;
- struct pidfs_exit_info *exit_info;
+ struct /* exit info */ {
+ __u64 cgroupid;
+ __s32 exit_code;
+ };
+ __u32 coredump_mask;
+ __u32 coredump_signal;
};
static struct rb_root pidfs_ino_tree = RB_ROOT;
@@ -293,6 +293,15 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
+ PIDFD_INFO_CREDS | \
+ PIDFD_INFO_CGROUPID | \
+ PIDFD_INFO_EXIT | \
+ PIDFD_INFO_COREDUMP | \
+ PIDFD_INFO_SUPPORTED_MASK | \
+ PIDFD_INFO_COREDUMP_SIGNAL)
+
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
@@ -300,12 +309,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
- struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
+ BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);
+
if (!uinfo)
return -EINVAL;
if (usize < PIDFD_INFO_SIZE_VER0)
@@ -323,20 +333,24 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(attr->exit_info);
- if (exit_info) {
+ if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
+ smp_rmb();
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
- kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.cgroupid = attr->cgroupid;
kinfo.mask |= PIDFD_INFO_CGROUPID;
#endif
- kinfo.exit_code = exit_info->exit_code;
+ kinfo.exit_code = attr->exit_code;
}
}
if (mask & PIDFD_INFO_COREDUMP) {
- kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
+ if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
+ smp_rmb();
+ kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+ kinfo.coredump_mask = attr->coredump_mask;
+ kinfo.coredump_signal = attr->coredump_signal;
+ }
}
task = get_pid_task(pid, PIDTYPE_PID);
@@ -355,14 +369,15 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
if (!c)
return -ESRCH;
- if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
- task_lock(task);
+ if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
+ guard(task_lock)(task);
if (task->mm) {
unsigned long flags = __mm_flags_get_dumpable(task->mm);
kinfo.coredump_mask = pidfs_coredump_mask(flags);
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ /* No coredump actually took place, so no coredump signal. */
}
- task_unlock(task);
}
/* Unconditionally return identifiers and credentials, the rest only on request */
@@ -409,6 +424,13 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
return -ESRCH;
copy_out:
+ if (mask & PIDFD_INFO_SUPPORTED_MASK) {
+ kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
+ kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
+ }
+
+ /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
+ WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
@@ -454,7 +476,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
struct ns_common *ns_common = NULL;
- struct pid_namespace *pid_ns;
if (!pidfs_ioctl_valid(cmd))
return -ENOIOCTLCMD;
@@ -496,66 +517,64 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (cmd) {
/* Namespaces that hang of nsproxy. */
case PIDFD_GET_CGROUP_NAMESPACE:
- if (IS_ENABLED(CONFIG_CGROUPS)) {
- get_cgroup_ns(nsp->cgroup_ns);
- ns_common = to_ns_common(nsp->cgroup_ns);
- }
+ if (!ns_ref_get(nsp->cgroup_ns))
+ break;
+ ns_common = to_ns_common(nsp->cgroup_ns);
break;
case PIDFD_GET_IPC_NAMESPACE:
- if (IS_ENABLED(CONFIG_IPC_NS)) {
- get_ipc_ns(nsp->ipc_ns);
- ns_common = to_ns_common(nsp->ipc_ns);
- }
+ if (!ns_ref_get(nsp->ipc_ns))
+ break;
+ ns_common = to_ns_common(nsp->ipc_ns);
break;
case PIDFD_GET_MNT_NAMESPACE:
- get_mnt_ns(nsp->mnt_ns);
+ if (!ns_ref_get(nsp->mnt_ns))
+ break;
ns_common = to_ns_common(nsp->mnt_ns);
break;
case PIDFD_GET_NET_NAMESPACE:
- if (IS_ENABLED(CONFIG_NET_NS)) {
- ns_common = to_ns_common(nsp->net_ns);
- get_net_ns(ns_common);
- }
+ if (!ns_ref_get(nsp->net_ns))
+ break;
+ ns_common = to_ns_common(nsp->net_ns);
break;
case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- get_pid_ns(nsp->pid_ns_for_children);
- ns_common = to_ns_common(nsp->pid_ns_for_children);
- }
+ if (!ns_ref_get(nsp->pid_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->pid_ns_for_children);
break;
case PIDFD_GET_TIME_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns);
- ns_common = to_ns_common(nsp->time_ns);
- }
+ if (!ns_ref_get(nsp->time_ns))
+ break;
+ ns_common = to_ns_common(nsp->time_ns);
break;
case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
- if (IS_ENABLED(CONFIG_TIME_NS)) {
- get_time_ns(nsp->time_ns_for_children);
- ns_common = to_ns_common(nsp->time_ns_for_children);
- }
+ if (!ns_ref_get(nsp->time_ns_for_children))
+ break;
+ ns_common = to_ns_common(nsp->time_ns_for_children);
break;
case PIDFD_GET_UTS_NAMESPACE:
- if (IS_ENABLED(CONFIG_UTS_NS)) {
- get_uts_ns(nsp->uts_ns);
- ns_common = to_ns_common(nsp->uts_ns);
- }
+ if (!ns_ref_get(nsp->uts_ns))
+ break;
+ ns_common = to_ns_common(nsp->uts_ns);
break;
/* Namespaces that don't hang of nsproxy. */
case PIDFD_GET_USER_NAMESPACE:
- if (IS_ENABLED(CONFIG_USER_NS)) {
- rcu_read_lock();
- ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct user_namespace *user_ns;
+
+ user_ns = task_cred_xxx(task, user_ns);
+ if (!ns_ref_get(user_ns))
+ break;
+ ns_common = to_ns_common(user_ns);
}
break;
case PIDFD_GET_PID_NAMESPACE:
- if (IS_ENABLED(CONFIG_PID_NS)) {
- rcu_read_lock();
+ scoped_guard(rcu) {
+ struct pid_namespace *pid_ns;
+
pid_ns = task_active_pid_ns(task);
- if (pid_ns)
- ns_common = to_ns_common(get_pid_ns(pid_ns));
- rcu_read_unlock();
+ if (!ns_ref_get(pid_ns))
+ break;
+ ns_common = to_ns_common(pid_ns);
}
break;
default:
@@ -606,24 +625,25 @@ void pidfs_exit(struct task_struct *tsk)
{
struct pid *pid = task_pid(tsk);
struct pidfs_attr *attr;
- struct pidfs_exit_info *exit_info;
#ifdef CONFIG_CGROUPS
struct cgroup *cgrp;
#endif
might_sleep();
- guard(spinlock_irq)(&pid->wait_pidfd.lock);
- attr = pid->attr;
- if (!attr) {
- /*
- * No one ever held a pidfd for this struct pid.
- * Mark it as dead so no one can add a pidfs
- * entry anymore. We're about to be reaped and
- * so no exit information would be available.
- */
- pid->attr = PIDFS_PID_DEAD;
- return;
+ /* Synchronize with pidfs_register_pid(). */
+ scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
}
/*
@@ -634,41 +654,39 @@ void pidfs_exit(struct task_struct *tsk)
* is put
*/
- exit_info = &attr->__pei;
-
#ifdef CONFIG_CGROUPS
rcu_read_lock();
cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
+ attr->cgroupid = cgroup_id(cgrp);
rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ attr->exit_code = tsk->exit_code;
/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&attr->exit_info, &attr->__pei);
+ smp_wmb();
+ set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
}
#ifdef CONFIG_COREDUMP
void pidfs_coredump(const struct coredump_params *cprm)
{
struct pid *pid = cprm->pid;
- struct pidfs_exit_info *exit_info;
struct pidfs_attr *attr;
- __u32 coredump_mask = 0;
attr = READ_ONCE(pid->attr);
VFS_WARN_ON_ONCE(!attr);
VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
- exit_info = &attr->__pei;
- /* Note how we were coredumped. */
- coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
- /* Note that we actually did coredump. */
- coredump_mask |= PIDFD_COREDUMPED;
+ /* Note how we were coredumped and that we coredumped. */
+ attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
+ PIDFD_COREDUMPED;
/* If coredumping is set to skip we should never end up here. */
- VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
- smp_store_release(&exit_info->coredump_mask, coredump_mask);
+ VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
+ /* Expose the signal number that caused the coredump. */
+ attr->coredump_signal = cprm->siginfo->si_signo;
+ smp_wmb();
+ set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
}
#endif
@@ -1022,6 +1040,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
fc->s_iflags |= SB_I_NOEXEC;
fc->s_iflags |= SB_I_NODEV;
+ ctx->s_d_flags |= DCACHE_DONTCACHE;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
diff --git a/fs/pipe.c b/fs/pipe.c
index 42fead1efe52..2d0fed2ecbfd 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -908,7 +908,7 @@ static struct inode * get_pipe_inode(void)
* list because "mark_inode_dirty()" will think
* that it already _is_ on the dirty list.
*/
- inode->i_state = I_DIRTY;
+ inode_state_assign_raw(inode, I_DIRTY);
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4050942ab52f..768f027c1428 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1091,7 +1091,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
int acl_type;
int error;
struct inode *inode = d_inode(dentry);
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
acl_type = posix_acl_type(acl_name);
if (acl_type < 0)
@@ -1141,7 +1141,7 @@ retry_deleg:
out_inode_unlock:
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
@@ -1212,7 +1212,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
int acl_type;
int error;
struct inode *inode = d_inode(dentry);
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
acl_type = posix_acl_type(acl_name);
if (acl_type < 0)
@@ -1249,7 +1249,7 @@ retry_deleg:
out_inode_unlock:
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 2ae63189091e..cbd4bc4a58e4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -481,7 +481,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
unsigned long flags;
int exit_code = task->exit_code;
struct signal_struct *sig = task->signal;
- unsigned int seq = 1;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -538,10 +537,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);
- do {
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
-
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
cmin_flt = sig->cmin_flt;
cmaj_flt = sig->cmaj_flt;
cutime = sig->cutime;
@@ -563,8 +559,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
rcu_read_unlock();
}
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+ }
if (whole) {
thread_group_cputime_adjusted(task, &utime, &stime);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6299878e3d97..407b41cb6e7c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3043,21 +3043,14 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
if (whole) {
struct signal_struct *sig = task->signal;
struct task_struct *t;
- unsigned int seq = 1;
- unsigned long flags;
-
- rcu_read_lock();
- do {
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
acct = sig->ioac;
__for_each_thread(sig, t)
task_io_accounting_add(&acct, &t->ioac);
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
+ }
} else {
acct = task->ioac;
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 176281112273..501889856461 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde)
}
}
+static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
+{
+ rb_erase(&pde->subdir_node, &parent->subdir);
+ RB_CLEAR_NODE(&pde->subdir_node);
+}
+
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
WARN(1, "removing permanent /proc entry '%s'", de->name);
de = NULL;
} else {
- rb_erase(&de->subdir_node, &parent->subdir);
+ pde_erase(de, parent);
if (S_ISDIR(de->mode))
parent->nlink--;
}
@@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
root->parent->name, root->name);
return -EINVAL;
}
- rb_erase(&root->subdir_node, &parent->subdir);
+ pde_erase(root, parent);
de = root;
while (1) {
@@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
next->parent->name, next->name);
return -EINVAL;
}
- rb_erase(&next->subdir_node, &de->subdir);
+ pde_erase(next, de);
de = next;
continue;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e399e2dd3a12..31d78da203ea 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -290,7 +290,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
qnx4_inode = qnx4_raw_inode(inode);
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 3310d1ad4d0e..88d285005083 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -521,7 +521,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ei = QNX6_I(inode);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6c4a6ee1fa2b..376739f6420e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1033,7 +1033,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
- if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+ if ((inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) ||
!atomic_read(&inode->i_writecount) ||
!dqinit_needed(inode, type)) {
spin_unlock(&inode->i_lock);
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 0addcc849ff2..360b00854115 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -302,7 +302,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
if (!i)
return ERR_PTR(-ENOMEM);
- if (!(i->i_state & I_NEW))
+ if (!(inode_state_read_once(i) & I_NEW))
return i;
/* precalculate the data offset */
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d469782f97f4..d69eab584bc6 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -250,8 +250,6 @@ static const struct file_operations signalfd_fops = {
static int do_signalfd4(int ufd, sigset_t *mask, int flags)
{
- struct signalfd_ctx *ctx;
-
/* Check the SFD_* constants for consistency. */
BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
@@ -263,7 +261,8 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
signotset(mask);
if (ufd == -1) {
- struct file *file;
+ int fd;
+ struct signalfd_ctx *ctx __free(kfree) = NULL;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -271,22 +270,16 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
ctx->sigmask = *mask;
- ufd = get_unused_fd_flags(flags & O_CLOEXEC);
- if (ufd < 0) {
- kfree(ctx);
- return ufd;
- }
-
- file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops,
- ctx, O_RDWR | (flags & O_NONBLOCK),
- FMODE_NOWAIT);
- if (IS_ERR(file)) {
- put_unused_fd(ufd);
- kfree(ctx);
- return PTR_ERR(file);
- }
- fd_install(ufd, file);
+ fd = FD_ADD(flags & O_CLOEXEC,
+ anon_inode_getfile_fmode(
+ "[signalfd]", &signalfd_fops, ctx,
+ O_RDWR | (flags & O_NONBLOCK), FMODE_NOWAIT));
+ if (fd >= 0)
+ retain_and_null_ptr(ctx);
+ return fd;
} else {
+ struct signalfd_ctx *ctx;
+
CLASS(fd, f)(ufd);
if (fd_empty(f))
return -EBADF;
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 018055fd2cdb..e3ea6fe7edb4 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -16,6 +16,7 @@ static struct cached_fid *init_cached_dir(const char *path);
static void free_cached_dir(struct cached_fid *cfid);
static void smb2_close_cached_fid(struct kref *ref);
static void cfids_laundromat_worker(struct work_struct *work);
+static void close_cached_dir_locked(struct cached_fid *cfid);
struct cached_dir_dentry {
struct list_head entry;
@@ -388,7 +389,7 @@ out:
* lease. Release one here, and the second below.
*/
cfid->has_lease = false;
- close_cached_dir(cfid);
+ close_cached_dir_locked(cfid);
}
spin_unlock(&cfids->cfid_list_lock);
@@ -480,18 +481,52 @@ void drop_cached_dir_by_name(const unsigned int xid, struct cifs_tcon *tcon,
spin_lock(&cfid->cfids->cfid_list_lock);
if (cfid->has_lease) {
cfid->has_lease = false;
- close_cached_dir(cfid);
+ close_cached_dir_locked(cfid);
}
spin_unlock(&cfid->cfids->cfid_list_lock);
close_cached_dir(cfid);
}
-
+/**
+ * close_cached_dir - drop a reference of a cached dir
+ *
+ * The release function will be called with cfid_list_lock held to remove the
+ * cached dirs from the list before any other thread can take another @cfid
+ * ref. Must not be called with cfid_list_lock held; use
+ * close_cached_dir_locked() instead.
+ *
+ * @cfid: cached dir
+ */
void close_cached_dir(struct cached_fid *cfid)
{
+ lockdep_assert_not_held(&cfid->cfids->cfid_list_lock);
kref_put_lock(&cfid->refcount, smb2_close_cached_fid, &cfid->cfids->cfid_list_lock);
}
+/**
+ * close_cached_dir_locked - put a reference of a cached dir with
+ * cfid_list_lock held
+ *
+ * Calling close_cached_dir() with cfid_list_lock held has the potential effect
+ * of causing a deadlock if the invariant of refcount >= 2 is false.
+ *
+ * This function is used in paths that hold cfid_list_lock and expect at least
+ * two references. If that invariant is violated, WARNs and returns without
+ * dropping a reference; the final put must still go through
+ * close_cached_dir().
+ *
+ * @cfid: cached dir
+ */
+static void close_cached_dir_locked(struct cached_fid *cfid)
+{
+ lockdep_assert_held(&cfid->cfids->cfid_list_lock);
+
+ if (WARN_ON(kref_read(&cfid->refcount) < 2))
+ return;
+
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+}
+
/*
* Called from cifs_kill_sb when we unmount a share
*/
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 9891f55bac1e..da935bd1ce87 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -90,7 +90,6 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
size_t desc_len;
struct key *spnego_key;
const char *hostname = server->hostname;
- const struct cred *saved_cred;
/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
host=hostname sec=mechanism uid=0xFF user=username */
@@ -158,9 +157,8 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp += sprintf(dp, ";upcall_target=app");
cifs_dbg(FYI, "key description = %s\n", description);
- saved_cred = override_creds(spnego_cred);
- spnego_key = request_key(&cifs_spnego_key_type, description, "");
- revert_creds(saved_cred);
+ scoped_with_creds(spnego_cred)
+ spnego_key = request_key(&cifs_spnego_key_type, description, "");
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 185ac41bd7e9..6eccb9ed9daa 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -500,7 +500,7 @@ cifs_evict_inode(struct inode *inode)
{
netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_NETFS_WB)
+ if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -1149,6 +1149,9 @@ cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv
struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
/* Check if file is oplocked if this is request for new lease */
if (arg == F_UNLCK ||
((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 7da194f29fef..dcc50a2bfa4b 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1363,6 +1363,14 @@ do_retry:
if (rdata->result == -ENODATA) {
rdata->result = 0;
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
+ trace_smb3_read_err(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->subreq.len - rdata->subreq.transferred,
+ rdata->result);
} else {
size_t trans = rdata->subreq.transferred + rdata->got_bytes;
if (trans < rdata->subreq.len &&
@@ -1374,6 +1382,13 @@ do_retry:
}
if (rdata->got_bytes)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
+ trace_smb3_read_done(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.persistent_fid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start + rdata->subreq.transferred,
+ rdata->got_bytes);
}
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
@@ -1445,6 +1460,13 @@ cifs_async_readv(struct cifs_io_subrequest *rdata)
rdata->iov[1].iov_base = (char *)smb + 4;
rdata->iov[1].iov_len = get_rfc1002_length(smb);
+ trace_smb3_read_enter(rdata->rreq->debug_id,
+ rdata->subreq.debug_index,
+ rdata->xid,
+ rdata->req->cfile->fid.netfid,
+ tcon->tid, tcon->ses->Suid,
+ rdata->subreq.start, rdata->subreq.len);
+
rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
cifs_readv_callback, NULL, rdata, 0, NULL);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 55cb4b0cbd48..2f94d93b95e9 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -4451,6 +4451,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
out:
kfree(ctx->username);
+ kfree(ctx->domainname);
kfree_sensitive(ctx->password);
kfree(origin_fullpath);
kfree(ctx);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 474dadeb1593..9dc0a968ec89 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -9,6 +9,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/filelock.h>
#include <linux/backing-dev.h>
#include <linux/stat.h>
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index e60927b2a7c8..2a0d8b87bd8e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -1435,12 +1435,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_errorf(fc, "Unknown error parsing devname\n");
goto cifs_parse_mount_err;
}
+ kfree(ctx->source);
ctx->source = smb3_fs_context_fullpath(ctx, '/');
if (IS_ERR(ctx->source)) {
ctx->source = NULL;
cifs_errorf(fc, "OOM when copying UNC string\n");
goto cifs_parse_mount_err;
}
+ kfree(fc->source);
fc->source = kstrdup(ctx->source, GFP_KERNEL);
if (fc->source == NULL) {
cifs_errorf(fc, "OOM when copying UNC string\n");
@@ -1468,7 +1470,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
break;
}
- if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) >
+ if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) ==
CIFS_MAX_USERNAME_LEN) {
pr_warn("username too long\n");
goto cifs_parse_mount_err;
@@ -1832,6 +1834,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->password = NULL;
kfree_sensitive(ctx->password2);
ctx->password2 = NULL;
+ kfree(ctx->source);
+ ctx->source = NULL;
+ kfree(fc->source);
+ fc->source = NULL;
return -EINVAL;
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index cac355364e43..b75482730912 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -6,6 +6,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/fs_struct.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
@@ -101,7 +102,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
__func__, cifs_i->uniqueid);
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cifs_dbg(FYI, "%s: inode %llu is new\n",
__func__, cifs_i->uniqueid);
return;
@@ -146,7 +147,7 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
*/
if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
/* only provide fake values on a new inode */
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
set_nlink(inode, 2);
else
@@ -167,12 +168,12 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- if (!(inode->i_state & I_NEW) &&
+ if (!(inode_state_read_once(inode) & I_NEW) &&
unlikely(inode_wrong_type(inode, fattr->cf_mode))) {
CIFS_I(inode)->time = 0; /* force reval */
return -ESTALE;
}
- if (inode->i_state & I_NEW)
+ if (inode_state_read_once(inode) & I_NEW)
CIFS_I(inode)->netfs.zero_point = fattr->cf_eof;
cifs_revalidate_cache(inode, fattr);
@@ -194,7 +195,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
inode->i_gid = fattr->cf_gid;
/* if dynperm is set, don't clobber existing mode */
- if (inode->i_state & I_NEW ||
+ if (inode_state_read(inode) & I_NEW ||
!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM))
inode->i_mode = fattr->cf_mode;
@@ -236,7 +237,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
if (fattr->cf_flags & CIFS_FATTR_JUNCTION)
inode->i_flags |= S_AUTOMOUNT;
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
cifs_set_netfs_context(inode);
cifs_set_ops(inode);
}
@@ -1638,7 +1639,7 @@ retry_iget5_locked:
cifs_fattr_to_inode(inode, fattr, false);
if (sb->s_flags & SB_NOATIME)
inode->i_flags |= S_NOATIME | S_NOCMTIME;
- if (inode->i_state & I_NEW) {
+ if (inode_state_read_once(inode) & I_NEW) {
inode->i_ino = hash;
cifs_fscache_get_inode_cookie(inode);
unlock_new_inode(inode);
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index ca8f3dd7ff63..78650527d4bb 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -7,6 +7,7 @@
#include <linux/pagemap.h>
#include <linux/vfs.h>
+#include <linux/fs_struct.h>
#include <uapi/linux/magic.h>
#include "cifsglob.h"
#include "cifsproto.h"
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 85a4c55b61b8..c6c428c2e08d 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -290,6 +290,9 @@ static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc)
break;
case SMBDIRECT_SOCKET_CREATED:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
case SMBDIRECT_SOCKET_CONNECTED:
sc->status = SMBDIRECT_SOCKET_ERROR;
break;
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 051cd9dbba13..915cedde5d66 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -830,7 +830,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!server || server->terminate)
continue;
- if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, cur))
continue;
/*
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index f901ae18e68a..94454e8826b0 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -6092,8 +6092,8 @@ static int smb2_create_link(struct ksmbd_work *work,
}
ksmbd_debug(SMB, "target name is %s\n", target_name);
- rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
- &path, 0);
+ rc = ksmbd_vfs_kern_path_start_removing(work, link_name, LOOKUP_NO_SYMLINKS,
+ &path, 0);
if (rc) {
if (rc != -ENOENT)
goto out;
@@ -6111,7 +6111,7 @@ static int smb2_create_link(struct ksmbd_work *work,
ksmbd_debug(SMB, "link already exists\n");
goto out;
}
- ksmbd_vfs_kern_path_unlock(&path);
+ ksmbd_vfs_kern_path_end_removing(&path);
}
rc = ksmbd_vfs_link(work, target_name, link_name);
if (rc)
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index 5d3b48e77012..e2be9a496154 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -334,6 +334,9 @@ smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc)
break;
case SMBDIRECT_SOCKET_CREATED:
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ break;
+
case SMBDIRECT_SOCKET_CONNECTED:
sc->status = SMBDIRECT_SOCKET_ERROR;
break;
@@ -1883,6 +1886,7 @@ static int smb_direct_accept_client(struct smbdirect_socket *sc)
static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
{
struct smbdirect_recv_io *recvmsg;
+ bool recv_posted = false;
int ret;
WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
@@ -1899,6 +1903,7 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
pr_err("Can't post recv: %d\n", ret);
goto out_err;
}
+ recv_posted = true;
ret = smb_direct_accept_client(sc);
if (ret) {
@@ -1908,7 +1913,14 @@ static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
return 0;
out_err:
- put_recvmsg(sc, recvmsg);
+ /*
+ * If the recv was never posted, return it to the free list.
+ * If it was posted, leave it alone so disconnect teardown can
+ * drain the QP and complete it (flush) and the completion path
+ * will unmap it exactly once.
+ */
+ if (!recv_posted)
+ put_recvmsg(sc, recvmsg);
return ret;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 7a1e3dcc2cde..d2e391c29464 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -290,8 +290,11 @@ static int ksmbd_kthread_fn(void *p)
}
}
up_read(&conn_list_lock);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ /* Per-IP limit hit: release the just-accepted socket. */
+ sock_release(client_sk);
continue;
+ }
skip_max_ip_conns_limit:
if (server_conf.max_connections &&
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 891ed2dc2b73..03fd7409be79 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -49,27 +49,9 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
i_uid_write(inode, i_uid_read(parent_inode));
}
-/**
- * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
- * @parent: parent dentry
- * @child: child dentry
- *
- * Returns: %0 on success, %-ENOENT if the parent dentry is not stable
- */
-int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
-{
- inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
- if (child->d_parent != parent) {
- inode_unlock(d_inode(parent));
- return -ENOENT;
- }
-
- return 0;
-}
-
static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
char *pathname, unsigned int flags,
- struct path *path, bool do_lock)
+ struct path *path, bool for_remove)
{
struct qstr last;
struct filename *filename __free(putname) = NULL;
@@ -99,22 +81,20 @@ static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
return -ENOENT;
}
- if (do_lock) {
+ if (for_remove) {
err = mnt_want_write(path->mnt);
if (err) {
path_put(path);
return -ENOENT;
}
- inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, path->dentry, 0);
+ d = start_removing_noperm(path->dentry, &last);
if (!IS_ERR(d)) {
dput(path->dentry);
path->dentry = d;
return 0;
}
- inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
return -ENOENT;
@@ -188,8 +168,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode)
}
mode |= S_IFREG;
- err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry),
- dentry, mode, true);
+ err = vfs_create(mnt_idmap(path.mnt), dentry, mode, NULL);
if (!err) {
ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
d_inode(dentry));
@@ -230,7 +209,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
d = dentry;
- dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
+ dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode, NULL);
if (IS_ERR(dentry))
err = PTR_ERR(dentry);
else if (d_is_negative(dentry))
@@ -609,7 +588,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path)
idmap = mnt_idmap(path->mnt);
if (S_ISDIR(d_inode(path->dentry)->i_mode)) {
- err = vfs_rmdir(idmap, d_inode(parent), path->dentry);
+ err = vfs_rmdir(idmap, d_inode(parent), path->dentry, NULL);
if (err && err != -ENOTEMPTY)
ksmbd_debug(VFS, "rmdir failed, err %d\n", err);
} else {
@@ -681,7 +660,6 @@ out1:
int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
char *newname, int flags)
{
- struct dentry *old_parent, *new_dentry, *trap;
struct dentry *old_child = old_path->dentry;
struct path new_path;
struct qstr new_last;
@@ -691,7 +669,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
struct ksmbd_file *parent_fp;
int new_type;
int err, lookup_flags = LOOKUP_NO_SYMLINKS;
- int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
if (ksmbd_override_fsids(work))
return -ENOMEM;
@@ -702,14 +679,6 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
goto revert_fsids;
}
- /*
- * explicitly handle file overwrite case, for compatibility with
- * filesystems that may not support rename flags (e.g: fuse)
- */
- if (flags & RENAME_NOREPLACE)
- target_lookup_flags |= LOOKUP_EXCL;
- flags &= ~(RENAME_NOREPLACE);
-
retry:
err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
&new_path, &new_last, &new_type,
@@ -726,17 +695,14 @@ retry:
if (err)
goto out2;
- trap = lock_rename_child(old_child, new_path.dentry);
- if (IS_ERR(trap)) {
- err = PTR_ERR(trap);
+ rd.mnt_idmap = mnt_idmap(old_path->mnt);
+ rd.old_parent = NULL;
+ rd.new_parent = new_path.dentry;
+ rd.flags = flags;
+ rd.delegated_inode = NULL,
+ err = start_renaming_dentry(&rd, lookup_flags, old_child, &new_last);
+ if (err)
goto out_drop_write;
- }
-
- old_parent = dget(old_child->d_parent);
- if (d_unhashed(old_child)) {
- err = -EINVAL;
- goto out3;
- }
parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent);
if (parent_fp) {
@@ -749,44 +715,17 @@ retry:
ksmbd_fd_put(work, parent_fp);
}
- new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
- lookup_flags | target_lookup_flags);
- if (IS_ERR(new_dentry)) {
- err = PTR_ERR(new_dentry);
- goto out3;
- }
-
- if (d_is_symlink(new_dentry)) {
+ if (d_is_symlink(rd.new_dentry)) {
err = -EACCES;
- goto out4;
- }
-
- if (old_child == trap) {
- err = -EINVAL;
- goto out4;
- }
-
- if (new_dentry == trap) {
- err = -ENOTEMPTY;
- goto out4;
+ goto out3;
}
- rd.mnt_idmap = mnt_idmap(old_path->mnt),
- rd.old_parent = old_parent,
- rd.old_dentry = old_child,
- rd.new_parent = new_path.dentry,
- rd.new_dentry = new_dentry,
- rd.flags = flags,
- rd.delegated_inode = NULL,
err = vfs_rename(&rd);
if (err)
ksmbd_debug(VFS, "vfs_rename failed err %d\n", err);
-out4:
- dput(new_dentry);
out3:
- dput(old_parent);
- unlock_rename(old_parent, new_path.dentry);
+ end_renaming(&rd);
out_drop_write:
mnt_drop_write(old_path->mnt);
out2:
@@ -1084,18 +1023,17 @@ int ksmbd_vfs_unlink(struct file *filp)
return err;
dir = dget_parent(dentry);
- err = ksmbd_vfs_lock_parent(dir, dentry);
- if (err)
+ dentry = start_removing_dentry(dir, dentry);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
goto out;
- dget(dentry);
if (S_ISDIR(d_inode(dentry)->i_mode))
- err = vfs_rmdir(idmap, d_inode(dir), dentry);
+ err = vfs_rmdir(idmap, d_inode(dir), dentry, NULL);
else
err = vfs_unlink(idmap, d_inode(dir), dentry, NULL);
- dput(dentry);
- inode_unlock(d_inode(dir));
+ end_removing(dentry);
if (err)
ksmbd_debug(VFS, "failed to delete, err %d\n", err);
out:
@@ -1207,7 +1145,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
static
int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
unsigned int flags,
- struct path *path, bool caseless, bool do_lock)
+ struct path *path, bool caseless, bool for_remove)
{
struct ksmbd_share_config *share_conf = work->tcon->share_conf;
struct path parent_path;
@@ -1215,7 +1153,7 @@ int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
int err;
retry:
- err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock);
+ err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, for_remove);
if (!err || !caseless)
return err;
@@ -1286,7 +1224,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
}
/**
- * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
+ * ksmbd_vfs_kern_path_start_removing() - lookup a file and get path info prior to removal
* @work: work
* @filepath: file path that is relative to share
* @flags: lookup flags
@@ -1298,20 +1236,19 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
* filesystem will have been gained.
* Return: 0 on if file was found, otherwise error
*/
-int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath,
- unsigned int flags,
- struct path *path, bool caseless)
+int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless)
{
return __ksmbd_vfs_kern_path(work, filepath, flags, path,
caseless, true);
}
-void ksmbd_vfs_kern_path_unlock(const struct path *path)
+void ksmbd_vfs_kern_path_end_removing(const struct path *path)
{
- /* While lock is still held, ->d_parent is safe */
- inode_unlock(d_inode(path->dentry->d_parent));
+ end_removing(path->dentry);
mnt_drop_write(path->mnt);
- path_put(path);
+ mntput(path->mnt);
}
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index df6421b4590b..16ca29ee16e5 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -120,10 +120,10 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
unsigned int flags,
struct path *path, bool caseless);
-int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags,
- struct path *path, bool caseless);
-void ksmbd_vfs_kern_path_unlock(const struct path *path);
+int ksmbd_vfs_kern_path_start_removing(struct ksmbd_work *work, char *name,
+ unsigned int flags,
+ struct path *path, bool caseless);
+void ksmbd_vfs_kern_path_end_removing(const struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
diff --git a/fs/splice.c b/fs/splice.c
index f5094b6d00a0..d338fe56b50b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1498,7 +1498,7 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
/*
* For lack of a better implementation, implement vmsplice() to userspace
- * as a simple copy of the pipes pages to the user iov.
+ * as a simple copy of the pipe's pages to the user iov.
*/
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
unsigned int flags)
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index cceae3b78698..82b687414e65 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -86,7 +86,7 @@ struct inode *squashfs_iget(struct super_block *sb, long long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
err = squashfs_read_inode(inode, ino);
diff --git a/fs/super.c b/fs/super.c
index 5bab94fb7e03..7c66b96b59be 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -389,6 +389,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
goto fail;
if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
goto fail;
+ s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
return s;
fail:
@@ -1183,11 +1184,14 @@ static inline bool get_active_super(struct super_block *sb)
static const char *filesystems_freeze_ptr = "filesystems_freeze";
-static void filesystems_freeze_callback(struct super_block *sb, void *unused)
+static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all_ptr)
{
if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
return;
+ if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE))
+ return;
+
if (!get_active_super(sb))
return;
@@ -1201,9 +1205,13 @@ static void filesystems_freeze_callback(struct super_block *sb, void *unused)
deactivate_super(sb);
}
-void filesystems_freeze(void)
+void filesystems_freeze(bool freeze_all)
{
- __iterate_supers(filesystems_freeze_callback, NULL,
+ void *freeze_all_ptr = NULL;
+
+ if (freeze_all)
+ freeze_all_ptr = &freeze_all;
+ __iterate_supers(filesystems_freeze_callback, freeze_all_ptr,
SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE);
}
diff --git a/fs/sync.c b/fs/sync.c
index 2955cd4c77a3..431fc5f5be06 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -117,16 +117,17 @@ SYSCALL_DEFINE0(sync)
static void do_sync_work(struct work_struct *work)
{
int nowait = 0;
+ int wait = 1;
/*
* Sync twice to reduce the possibility we skipped some inodes / pages
* because they were temporarily locked
*/
- iterate_supers(sync_inodes_one_sb, &nowait);
+ iterate_supers(sync_inodes_one_sb, NULL);
iterate_supers(sync_fs_one_sb, &nowait);
sync_bdevs(false);
- iterate_supers(sync_inodes_one_sb, &nowait);
- iterate_supers(sync_fs_one_sb, &nowait);
+ iterate_supers(sync_inodes_one_sb, NULL);
+ iterate_supers(sync_fs_one_sb, &wait);
sync_bdevs(false);
printk("Emergency Sync complete\n");
kfree(work);
@@ -182,7 +183,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
if (!file->f_op->fsync)
return -EINVAL;
- if (!datasync && (inode->i_state & I_DIRTY_TIME))
+ if (!datasync && (inode_state_read_once(inode) & I_DIRTY_TIME))
mark_inode_dirty_sync(inode);
return file->f_op->fsync(file, start, end, datasync);
}
@@ -280,14 +281,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- int sync_mode = WB_SYNC_NONE;
-
if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
SYNC_FILE_RANGE_WRITE_AND_WAIT)
- sync_mode = WB_SYNC_ALL;
-
- ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
- sync_mode);
+ ret = filemap_fdatawrite_range(mapping, offset,
+ endbyte);
+ else
+ ret = filemap_flush_range(mapping, offset, endbyte);
if (ret < 0)
goto out;
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c68f28d9c426..9fcea7860ddf 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -393,9 +393,8 @@ static const struct file_operations timerfd_fops = {
SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{
- int ufd;
- struct timerfd_ctx *ctx;
- struct file *file;
+ struct timerfd_ctx *ctx __free(kfree) = NULL;
+ int ret;
/* Check the TFD_* constants for consistency. */
BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
@@ -432,23 +431,13 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
ctx->moffs = ktime_mono_to_real(0);
- ufd = get_unused_fd_flags(flags & TFD_SHARED_FCNTL_FLAGS);
- if (ufd < 0) {
- kfree(ctx);
- return ufd;
- }
-
- file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
- O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
- FMODE_NOWAIT);
- if (IS_ERR(file)) {
- put_unused_fd(ufd);
- kfree(ctx);
- return PTR_ERR(file);
- }
-
- fd_install(ufd, file);
- return ufd;
+ ret = FD_ADD(flags & TFD_SHARED_FCNTL_FLAGS,
+ anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
+ O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
+ FMODE_NOWAIT));
+ if (ret >= 0)
+ retain_and_null_ptr(ctx);
+ return ret;
}
static int do_timerfd_settime(int ufd, int flags,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index ca41ce8208c4..c3265b8804f5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1323,7 +1323,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
inode_lock(inode);
/* Synchronize the inode unless this is a 'datasync()' call. */
- if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
+ if (!datasync || (inode_state_read_once(inode) & I_DIRTY_DATASYNC)) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 46952a33c4e6..f453c37cee37 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -114,7 +114,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
inode = iget_locked(sb, inum);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ui = ubifs_inode(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a79d73f28aa7..7fae8002344a 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1962,7 +1962,7 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
if (UDF_I(inode)->i_hidden != hidden_inode) {
iput(inode);
return ERR_PTR(-EFSCORRUPTED);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 8361c00e8fa6..e2b0a35de2a7 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -655,7 +655,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino)
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
ufsi = UFS_I(inode);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 54c6cc7fe9c6..e6e74b384087 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2111,9 +2111,7 @@ static void init_once_userfaultfd_ctx(void *mem)
static int new_userfaultfd(int flags)
{
- struct userfaultfd_ctx *ctx;
- struct file *file;
- int fd;
+ struct userfaultfd_ctx *ctx __free(kfree) = NULL;
VM_WARN_ON_ONCE(!current->mm);
@@ -2135,26 +2133,18 @@ static int new_userfaultfd(int flags)
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
- fd = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
- if (fd < 0)
- goto err_out;
+ FD_PREPARE(fdf, flags & UFFD_SHARED_FCNTL_FLAGS,
+ anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS),
+ NULL));
+ if (fdf.err)
+ return fdf.err;
- /* Create a new inode so that the LSM can block the creation. */
- file = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- fd = PTR_ERR(file);
- goto err_out;
- }
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- file->f_mode |= FMODE_NOWAIT;
- fd_install(fd, file);
- return fd;
-err_out:
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
- return fd;
+ fd_prepare_file(fdf)->f_mode |= FMODE_NOWAIT;
+ retain_and_null_ptr(ctx);
+ return fd_publish(fdf);
}
static inline bool userfaultfd_syscall_allowed(int flags)
diff --git a/fs/utimes.c b/fs/utimes.c
index c7c7958e57b2..86f8ce8cd6b1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -22,7 +22,7 @@ int vfs_utimes(const struct path *path, struct timespec64 *times)
int error;
struct iattr newattrs;
struct inode *inode = path->dentry->d_inode;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
if (times) {
if (!nsec_valid(times[0].tv_nsec) ||
@@ -66,7 +66,7 @@ retry_deleg:
error = notify_change(mnt_idmap(path->mnt), path->dentry, &newattrs,
&delegated_inode);
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
@@ -76,6 +76,7 @@ retry_deleg:
out:
return error;
}
+EXPORT_SYMBOL_GPL(vfs_utimes);
static int do_utimes_path(int dfd, const char __user *filename,
struct timespec64 *times, int flags)
diff --git a/fs/xattr.c b/fs/xattr.c
index 8851a5ef34f5..32d445fb60aa 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -274,7 +274,7 @@ int __vfs_setxattr_noperm(struct mnt_idmap *idmap,
int
__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry,
const char *name, const void *value, size_t size,
- int flags, struct inode **delegated_inode)
+ int flags, struct delegated_inode *delegated_inode)
{
struct inode *inode = dentry->d_inode;
int error;
@@ -305,7 +305,7 @@ vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
const char *name, const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
const void *orig_value = value;
int error;
@@ -322,7 +322,7 @@ retry_deleg:
flags, &delegated_inode);
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
@@ -533,7 +533,7 @@ EXPORT_SYMBOL(__vfs_removexattr);
int
__vfs_removexattr_locked(struct mnt_idmap *idmap,
struct dentry *dentry, const char *name,
- struct inode **delegated_inode)
+ struct delegated_inode *delegated_inode)
{
struct inode *inode = dentry->d_inode;
int error;
@@ -567,7 +567,7 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
const char *name)
{
struct inode *inode = dentry->d_inode;
- struct inode *delegated_inode = NULL;
+ struct delegated_inode delegated_inode = { };
int error;
retry_deleg:
@@ -576,7 +576,7 @@ retry_deleg:
name, &delegated_inode);
inode_unlock(inode);
- if (delegated_inode) {
+ if (is_delegated(&delegated_inode)) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index de840abc0bcd..57e47077c75a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -73,7 +73,8 @@
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
-#define XFS_ERRTAG_MAX 46
+#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
+#define XFS_ERRTAG_MAX 47
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
-XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
+XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 2ef7742be7d3..7bfa37c99480 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -1249,7 +1249,7 @@ xchk_irele(
* hits do not clear DONTCACHE, so we must do it here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index a90a011c7e5f..4f7040c9ddf0 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1933,7 +1933,7 @@ xrep_inode_pptr(
* Unlinked inodes that cannot be added to the directory tree will not
* have a parent pointer.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return 0;
/* Children of the superblock do not have parent pointers. */
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 9c12cb844231..4e550a1d5353 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -152,11 +152,10 @@ xrep_orphanage_create(
}
/* Try to find the orphanage directory. */
- inode_lock_nested(root_inode, I_MUTEX_PARENT);
- orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
+ orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE));
if (IS_ERR(orphanage_dentry)) {
error = PTR_ERR(orphanage_dentry);
- goto out_unlock_root;
+ goto out_dput_root;
}
/*
@@ -167,10 +166,10 @@ xrep_orphanage_create(
*/
if (d_really_is_negative(orphanage_dentry)) {
orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
- orphanage_dentry, 0750);
+ orphanage_dentry, 0750, NULL);
error = PTR_ERR(orphanage_dentry);
if (IS_ERR(orphanage_dentry))
- goto out_unlock_root;
+ goto out_dput_orphanage;
}
/* Not a directory? Bail out. */
@@ -200,9 +199,7 @@ xrep_orphanage_create(
sc->orphanage_ilock_flags = 0;
out_dput_orphanage:
- dput(orphanage_dentry);
-out_unlock_root:
- inode_unlock(VFS_I(sc->mp->m_rootip));
+ end_creating(orphanage_dentry);
out_dput_root:
dput(root_dentry);
out:
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 3b692c4acc1e..11d5de10fd56 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -915,7 +915,7 @@ xchk_pptr_looks_zapped(
* Temporary files that cannot be linked into the directory tree do not
* have attr forks because they cannot ever have parents.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return false;
/*
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 5902398185a8..df629892462f 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -184,7 +184,7 @@ xrep_symlink_salvage_inline(
sc->ip->i_disk_size == 1 && old_target[0] == '?')
return 0;
- nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
+ nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes);
memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index cdd13ed9c569..ed2e8c64b1a8 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -834,7 +834,7 @@ xfarray_sort_scan(
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
- next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ next_pos = folio_next_pos(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index a26f79815533..56a544638491 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -271,7 +271,7 @@ xfs_discard_folio(
* folio itself and not the start offset that is passed in.
*/
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
+ folio_next_pos(folio), NULL);
}
/*
@@ -742,14 +742,15 @@ xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
+ return 0;
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops);
+ iomap_bio_readahead(rac, &xfs_read_iomap_ops);
}
static int
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e43..2208a720ec3f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -514,7 +514,7 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- if (!(VFS_I(ip)->i_state & I_FREEING))
+ if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2702fef2c90c..6108612182e2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -674,8 +676,17 @@ xfs_file_dio_write_aligned(
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
ssize_t ret;
+ /*
+ * For always COW inodes, each bio must be aligned to the file system
+ * block size and not just the device sector size because we need to
+ * allocate a block-aligned amount of space for each write.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -693,7 +704,7 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
xfs_iunlock(ip, iolock);
return ret;
@@ -890,15 +901,7 @@ xfs_file_dio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /*
- * For always COW inodes we also must check the alignment of each
- * individual iovec segment, as they could end up with different
- * I/Os due to the way bio_iov_iter_get_pages works, and we'd
- * then overwrite an already written block.
- */
- if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
- (xfs_is_always_cow_inode(ip) &&
- (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
@@ -1254,23 +1257,36 @@ xfs_falloc_zero_range(
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
- trace_xfs_zero_file_space(XFS_I(inode));
+ trace_xfs_zero_file_space(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
- if (error)
- return error;
+ /*
+ * Zero range implements a full zeroing mechanism but is only used in
+ * limited situations. It is more efficient to allocate unwritten
+ * extents than to perform zeroing here, so use an errortag to randomly
+ * force zeroing on DEBUG kernels for added test coverage.
+ */
+ if (XFS_TEST_ERROR(ip->i_mount,
+ XFS_ERRTAG_FORCE_ZERO_RANGE)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
- len = round_up(offset + len, blksize) - round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
+ }
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index f19fce557354..5a3e3bf4e7cc 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -233,14 +233,11 @@ xfs_open_by_handle(
xfs_fsop_handlereq_t *hreq)
{
const struct cred *cred = current_cred();
- int error;
- int fd;
int permflag;
- struct file *filp;
struct inode *inode;
struct dentry *dentry;
fmode_t fmode;
- struct path path;
+ struct path path __free(path_put) = {};
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -249,12 +246,11 @@ xfs_open_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
inode = d_inode(dentry);
+ path.dentry = dentry;
/* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -EPERM;
- goto out_dput;
- }
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EPERM;
#if BITS_PER_LONG != 32
hreq->oflags |= O_LARGEFILE;
@@ -263,48 +259,30 @@ xfs_open_by_handle(
permflag = hreq->oflags;
fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ (fmode & FMODE_WRITE) && IS_APPEND(inode))
+ return -EPERM;
- if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode))
+ return -EPERM;
/* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -EISDIR;
- goto out_dput;
- }
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE))
+ return -EISDIR;
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- error = fd;
- goto out_dput;
- }
+ path.mnt = mntget(parfilp->f_path.mnt);
- path.mnt = parfilp->f_path.mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, hreq->oflags, cred);
- dput(dentry);
- if (IS_ERR(filp)) {
- put_unused_fd(fd);
- return PTR_ERR(filp);
- }
+ FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred));
+ if (fdf.err)
+ return fdf.err;
if (S_ISREG(inode->i_mode)) {
+ struct file *filp = fd_prepare_file(fdf);
+
filp->f_flags |= O_NOATIME;
filp->f_mode |= FMODE_NOCMTIME;
}
- fd_install(fd, filp);
- return fd;
-
- out_dput:
- dput(dentry);
- return error;
+ return fd_publish(fdf);
}
int
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 7c541fb373d5..3c1557fb1cf0 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -285,7 +285,7 @@ xfs_inode_mark_sick(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
@@ -309,7 +309,7 @@ xfs_inode_mark_corrupt(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index e44040206851..f3fc4d21bfe1 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -334,7 +334,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
- unsigned long state = inode->i_state;
+ unsigned long state = inode_state_read_once(inode);
error = inode_init_always(mp->m_super, inode);
@@ -345,7 +345,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_state = state;
+ inode_state_assign_raw(inode, state);
mapping_set_folio_min_order(inode->i_mapping,
M_IGEO(mp)->min_folio_order);
return error;
@@ -411,7 +411,7 @@ xfs_iget_recycle(
ip->i_flags |= XFS_INEW;
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
- inode->i_state = I_NEW;
+ inode_state_assign_raw(inode, I_NEW);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 36b39539e561..f1f88e48fe22 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1580,7 +1580,7 @@ xfs_iunlink_reload_next(
next_ip->i_prev_unlinked = prev_agino;
trace_xfs_iunlink_reload_next(next_ip);
rele:
- ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE));
if (xfs_is_quotacheck_running(mp) && next_ip)
xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
xfs_irele(next_ip);
@@ -2111,7 +2111,7 @@ xfs_rename_alloc_whiteout(
*/
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
- VFS_I(tmpfile)->i_state |= I_LINKABLE;
+ inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE);
*wip = tmpfile;
return 0;
@@ -2330,7 +2330,7 @@ retry:
* flag from the inode so it doesn't accidentally get misused in
* future.
*/
- VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
+ inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE);
}
out_commit:
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 1bd411a1114c..2eb0c6011a2e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -113,9 +113,9 @@ xfs_inode_item_precommit(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a6bb7ee7a27a..59eaad774371 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1408,10 +1408,8 @@ xfs_file_ioctl(
trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
- sb_start_write(mp->m_super);
- error = xfs_blockgc_free_space(mp, &icw);
- sb_end_write(mp->m_super);
- return error;
+ guard(super_write)(mp->m_super);
+ return xfs_blockgc_free_space(mp, &icw);
}
case XFS_IOC_EXCHANGE_RANGE:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 490e12cb99be..04f39ea15898 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1758,6 +1758,8 @@ xfs_buffered_write_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
+ iomap);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1823,21 +1825,41 @@ xfs_buffered_write_iomap_begin(
}
/*
- * For zeroing, trim a delalloc extent that extends beyond the EOF
- * block. If it starts beyond the EOF block, convert it to an
+ * For zeroing, trim extents that extend beyond the EOF block. If a
+ * delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
- if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
- isnullstartblock(imap.br_startblock)) {
+ if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+ u64 end;
- if (offset_fsb >= eof_fsb)
+ if (isnullstartblock(imap.br_startblock) &&
+ offset_fsb >= eof_fsb)
goto convert_delay;
- if (end_fsb > eof_fsb) {
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
end_fsb = eof_fsb;
- xfs_trim_extent(&imap, offset_fsb,
- end_fsb - offset_fsb);
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF.
+ * Providing this bypasses the flush iomap uses to trigger
+ * extent conversion when unwritten mappings have dirty
+ * pagecache in need of zeroing.
+ *
+ * Trim the mapping to the end pos of the lookup, which in turn
+ * was trimmed to the end of the batch if it became full before
+ * the end of the mapping.
+ */
+ if (imap.br_state == XFS_EXT_UNWRITTEN &&
+ offset_fsb < eof_fsb) {
+ loff_t len = min(count,
+ XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+ end = iomap_fill_dirty_folios(iter, offset, len);
+ end_fsb = min_t(xfs_fileoff_t, end_fsb,
+ XFS_B_TO_FSB(mp, end));
}
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
}
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index caff0125faea..ad94fbf55014 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1420,7 +1420,7 @@ xfs_setup_inode(
bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
- inode->i_state |= I_NEW;
+ inode_state_set_raw(inode, I_NEW);
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 36cda724da89..9d1ed9bb0bee 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
- if ((inode->i_state & I_DIRTY_PAGES) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_PAGES) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
atomic_read(&inode->i_dio_count))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1067ebb3b001..bc71aa9dcee8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1693,7 +1693,10 @@ xfs_fs_fill_super(
if (error)
return error;
- sb_min_blocksize(sb, BBSIZE);
+ if (!sb_min_blocksize(sb, BBSIZE)) {
+ xfs_err(mp, "unable to set blocksize");
+ return -EINVAL;
+ }
sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
#ifdef CONFIG_XFS_QUOTA
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index ef7a931ebde5..8dde444596f1 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1204,6 +1204,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1234,10 +1235,33 @@ xfs_mount_zones(
return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that both to reduce seeks, but also more
+ * importantly so that workloads that write files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase it to the zone size, capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also gets applied to non-zoned files on the data device if
+ * there are any. On a typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
error = blkdev_report_zones(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 90e2ad8ee5f4..c1e5e30e90a0 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -112,12 +112,13 @@ static const struct iomap_ops zonefs_write_iomap_ops = {
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &zonefs_read_iomap_ops);
+ return 0;
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_read_iomap_ops);
+ iomap_bio_readahead(rac, &zonefs_read_iomap_ops);
}
/*
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 70be0b3dda49..086a31269198 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -644,7 +644,7 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
+ if (!(inode_state_read_once(inode) & I_NEW)) {
WARN_ON_ONCE(inode->i_private != z);
return inode;
}
@@ -683,7 +683,7 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode_state_read_once(inode) & I_NEW))
return inode;
inode->i_ino = ino;
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index 7146a8e9e9c2..d0eccbd920e5 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -417,15 +417,32 @@ static inline void acpi_processor_throttling_init(void) {}
#endif /* CONFIG_ACPI_CPU_FREQ_PSS */
/* in processor_idle.c */
+extern struct cpuidle_driver acpi_idle_driver;
#ifdef CONFIG_ACPI_PROCESSOR_IDLE
-void acpi_processor_power_init(struct acpi_processor *pr);
-void acpi_processor_power_exit(struct acpi_processor *pr);
+int acpi_processor_power_init(struct acpi_processor *pr);
+int acpi_processor_power_exit(struct acpi_processor *pr);
int acpi_processor_power_state_has_changed(struct acpi_processor *pr);
int acpi_processor_hotplug(struct acpi_processor *pr);
-void acpi_processor_register_idle_driver(void);
-void acpi_processor_unregister_idle_driver(void);
-int acpi_processor_ffh_lpi_probe(unsigned int cpu);
-int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
+#else
+static inline int acpi_processor_power_init(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_power_exit(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
+
+static inline int acpi_processor_hotplug(struct acpi_processor *pr)
+{
+ return -ENODEV;
+}
#endif /* CONFIG_ACPI_PROCESSOR_IDLE */
/* in processor_thermal.c */
@@ -448,6 +465,11 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy)
}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_ACPI_PROCESSOR_IDLE
+extern int acpi_processor_ffh_lpi_probe(unsigned int cpu);
+extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi);
+#endif
+
void acpi_processor_init_invariance_cppc(void);
#endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0cdae6f809b5..a464ff6c1a61 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -971,7 +971,8 @@
#define RUNTIME_CONST_VARIABLES \
RUNTIME_CONST(shift, d_hash_shift) \
- RUNTIME_CONST(ptr, dentry_hashtable)
+ RUNTIME_CONST(ptr, dentry_hashtable) \
+ RUNTIME_CONST(ptr, __dentry_cache)
/* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */
#define KUNIT_TABLE() \
diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index da6301a6fcea..69d4ae92d822 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -877,7 +877,10 @@
MACRO__(0xB08F, ## __VA_ARGS__), \
MACRO__(0xB090, ## __VA_ARGS__), \
MACRO__(0xB0A0, ## __VA_ARGS__), \
- MACRO__(0xB0B0, ## __VA_ARGS__), \
+ MACRO__(0xB0B0, ## __VA_ARGS__)
+
+/* WCL */
+#define INTEL_WCL_IDS(MACRO__, ...) \
MACRO__(0xFD80, ## __VA_ARGS__), \
MACRO__(0xFD81, ## __VA_ARGS__)
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 792e10a09787..c9013e472aa3 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -566,6 +566,7 @@ struct ata_bmdma_prd {
#define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8))
#define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1)
#define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7))
+#define ata_id_is_locked(id) (((id)[ATA_ID_DLF] & 0x7) == 0x7)
#define ata_id_has_atapi_AN(id) \
((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \
((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 9409a6ddf3e0..37ab6314a9f7 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg(v, old, new);
}
@@ -1298,7 +1298,7 @@ static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_acquire(v, old, new);
}
@@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_release(v, old, new);
}
@@ -1343,7 +1343,7 @@ static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_try_cmpxchg_relaxed(v, old, new);
}
@@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg(v, old, new);
}
@@ -2876,7 +2876,7 @@ static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_acquire(v, old, new);
}
@@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_release(v, old, new);
}
@@ -2921,7 +2921,7 @@ static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic64_try_cmpxchg_relaxed(v, old, new);
}
@@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
kcsan_mb();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg(v, old, new);
}
@@ -4454,7 +4454,7 @@ static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_acquire(v, old, new);
}
@@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
kcsan_release();
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_release(v, old, new);
}
@@ -4499,7 +4499,7 @@ static __always_inline bool
atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
instrument_atomic_read_write(v, sizeof(*v));
- instrument_atomic_read_write(old, sizeof(*old));
+ instrument_read_write(old, sizeof(*old));
return raw_atomic_long_try_cmpxchg_relaxed(v, old, new);
}
@@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 8829b337928e9508259079d32581775ececd415b
+// f618ac667f868941a84ce0ab2242f1786e049ed4
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c5c9d89c73ed..610ef62b6a32 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -63,6 +63,8 @@ enum wb_reason {
struct wb_completion {
atomic_t cnt;
wait_queue_head_t *waitq;
+ unsigned long progress_stamp; /* The jiffies when slow progress is detected */
+ unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */
};
#define __WB_COMPLETION_INIT(_waitq) \
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3e64f14739dd..0c8342747cab 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
rcu_read_lock();
/*
- * Paired with store_release in inode_switch_wbs_work_fn() and
+ * Paired with a release fence in inode_do_switch_wbs() and
* ensures that we see the new wb if we see cleared I_WB_SWITCH.
*/
- cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+ cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH;
+ smp_rmb();
if (unlikely(cookie->locked))
xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 733e7f93db66..63e0e2aa1ce9 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
u64 ceph_client_gid(struct ceph_client *client);
extern void ceph_destroy_client(struct ceph_client *client);
extern void ceph_reset_client_addr(struct ceph_client *client);
-extern int __ceph_open_session(struct ceph_client *client,
- unsigned long started);
+extern int __ceph_open_session(struct ceph_client *client);
extern int ceph_open_session(struct ceph_client *client);
int ceph_wait_for_latest_osdmap(struct ceph_client *client,
unsigned long timeout);
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 2573585b7f06..b8bd2f15f91f 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val)
* CLASS(name, var)(args...):
* declare the variable @var as an instance of the named class
*
+ * CLASS_INIT(name, var, init_expr):
+ * declare the variable @var as an instance of the named class with
+ * custom initialization expression.
+ *
* Ex.
*
* DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
@@ -290,15 +294,19 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
class_##_name##_t var __cleanup(class_##_name##_destructor) = \
class_##_name##_constructor
-#define scoped_class(_name, var, args) \
- for (CLASS(_name, var)(args); \
- __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \
- ({ goto _label; })) \
- if (0) { \
-_label: \
- break; \
+#define CLASS_INIT(_name, _var, _init_expr) \
+ class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr)
+
+#define __scoped_class(_name, var, _label, args...) \
+ for (CLASS(_name, var)(args); ; ({ goto _label; })) \
+ if (0) { \
+_label: \
+ break; \
} else
+#define scoped_class(_name, var, args...) \
+ __scoped_class(_name, var, __UNIQUE_ID(label), args)
+
/*
* DEFINE_GUARD(name, type, lock, unlock):
* trivial wrapper around DEFINE_CLASS() above specifically
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 89ae50ad2ace..343a140a6ba2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -20,6 +20,8 @@
struct cred;
struct inode;
+extern struct task_struct init_task;
+
/*
* COW Supplementary groups list
*/
@@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
+static inline const struct cred *kernel_cred(void)
+{
+ /*
+  * Return the credentials attached to init_task.
+  * rcu_dereference_raw() is used only to shut up sparse;
+  * NOTE(review): presumably because ->cred is an __rcu-annotated
+  * pointer - confirm against struct task_struct.
+  */
+ return rcu_dereference_raw(init_task.cred);
+}
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
@@ -180,6 +187,16 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred)
return rcu_replace_pointer(current->cred, revert_cred, 1);
}
+DEFINE_CLASS(override_creds,
+ const struct cred *,
+ revert_creds(_T),
+ override_creds(override_cred), const struct cred *override_cred)
+
+#define scoped_with_creds(cred) \
+ scoped_class(override_creds, __UNIQUE_ID(label), cred)
+
+#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred())
+
/**
* get_cred_many - Get references on a set of credentials
* @cred: The credentials to reference
@@ -263,6 +280,11 @@ static inline void put_cred(const struct cred *cred)
put_cred_many(cred, 1);
}
+DEFINE_CLASS(prepare_creds,
+ struct cred *,
+ if (_T) put_cred(_T),
+ prepare_creds(), void)
+
DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T))
/**
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8248ff9363ee..2ceda49c609f 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -90,7 +90,7 @@
*/
#define DMA_MAPPING_ERROR (~(dma_addr_t)0)
-#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
+#define DMA_BIT_MASK(n) GENMASK_ULL((n) - 1, 0) /* mask of the low @n bits; @n is parenthesized so expression arguments expand correctly */
struct dma_iova_state {
dma_addr_t addr;
diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h
index 42c89e3e5ca7..bfa767702d9a 100644
--- a/include/linux/entry-virt.h
+++ b/include/linux/entry-virt.h
@@ -32,7 +32,7 @@
*/
static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work);
-#ifndef arch_xfer_to_guest_mode_work
+#ifndef arch_xfer_to_guest_mode_handle_work
static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work)
{
return 0;
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c2d8b4ec62eb..5c9162193d26 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -492,7 +492,7 @@ struct ethtool_pause_stats {
};
#define ETHTOOL_MAX_LANES 8
-/**
+/*
* IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for
* the end-of-list marker, total 17 items
*/
diff --git a/include/linux/file.h b/include/linux/file.h
index af1768d934a0..cf389fde9bc2 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -127,4 +127,130 @@ extern void __fput_sync(struct file *);
extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
+/*
+ * fd_prepare: Combined fd + file allocation cleanup class.
+ * @err: 0 if both the fd and the file were obtained, otherwise the error
+ *       code recorded at initialization time.
+ * @__fd: Allocated fd (must not be accessed directly, use fd_prepare_fd())
+ * @__file: Allocated struct file pointer (must not be accessed directly,
+ *          use fd_prepare_file())
+ *
+ * Allocates an fd and a file together. On error paths, automatically cleans
+ * up whichever resource was successfully allocated. Allows flexible file
+ * allocation with different functions per usage.
+ *
+ * Do not use this struct directly; declare instances with FD_PREPARE() or
+ * use FD_ADD().
+ */
+struct fd_prepare {
+ s32 err;
+ s32 __fd; /* do not access directly */
+ struct file *__file; /* do not access directly */
+};
+
+/* Typedef for fd_prepare cleanup guards. */
+typedef struct fd_prepare class_fd_prepare_t;
+
+/*
+ * Accessors for fd_prepare class members.
+ * _Generic() is used for zero-cost type safety.
+ */
+#define fd_prepare_fd(_fdf) \
+ (_Generic((_fdf), struct fd_prepare: (_fdf).__fd))
+
+#define fd_prepare_file(_fdf) \
+ (_Generic((_fdf), struct fd_prepare: (_fdf).__file))
+
+/*
+ * Cleanup handler for the fd_prepare class.
+ * Does nothing unless @fdf->err is set. In that case the reserved fd is
+ * returned with put_unused_fd() if it is non-negative, and the file is
+ * dropped with fput() unless the pointer is NULL or an ERR_PTR.
+ *
+ * Do not use directly.
+ */
+static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf)
+{
+ if (unlikely(fdf->err)) {
+ if (likely(fdf->__fd >= 0))
+ put_unused_fd(fdf->__fd);
+ if (unlikely(!IS_ERR_OR_NULL(fdf->__file)))
+ fput(fdf->__file);
+ }
+}
+
+/*
+ * Compute the error state of an fd_prepare.
+ * Checks, in order: an already recorded ->err, a negative ->__fd (returned
+ * as is), an ERR_PTR ->__file (its PTR_ERR() is returned) and a NULL
+ * ->__file (-ENOMEM). Returns 0 if none of these apply.
+ *
+ * Do not use directly.
+ */
+static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf)
+{
+ if (unlikely(fdf->err))
+ return fdf->err;
+ if (unlikely(fdf->__fd < 0))
+ return fdf->__fd;
+ if (unlikely(IS_ERR(fdf->__file)))
+ return PTR_ERR(fdf->__file);
+ if (unlikely(!fdf->__file))
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * __FD_PREPARE_INIT - Helper to initialize fd_prepare class.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: expression that returns struct file *; only evaluated if
+ *               the fd allocation succeeded
+ *
+ * Returns a struct fd_prepare with fd, file, and err set.
+ * If fd allocation fails, fd will be negative and err will be set. If
+ * fd succeeds but @_file_owned fails, file will be ERR_PTR and err
+ * will be set. The err field is the single source of truth for error
+ * checking.
+ */
+#define __FD_PREPARE_INIT(_fd_flags, _file_owned) \
+ ({ \
+ struct fd_prepare fdf = { \
+ .__fd = get_unused_fd_flags((_fd_flags)), \
+ }; \
+ if (likely(fdf.__fd >= 0)) \
+ fdf.__file = (_file_owned); \
+ fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \
+ fdf; \
+ })
+
+/*
+ * FD_PREPARE - Macro to declare and initialize an fd_prepare variable.
+ *
+ * Declares and initializes an fd_prepare variable with automatic
+ * cleanup. No separate scope required - cleanup happens when variable
+ * goes out of scope.
+ *
+ * @_fdf: name of struct fd_prepare variable to define
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of (can be expression)
+ */
+#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \
+ CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned))
+
+/*
+ * fd_publish - Publish prepared fd and file to the fd table.
+ * @_fdf: struct fd_prepare variable
+ *
+ * Installs the file with fd_install() and evaluates to the fd number.
+ * A set ->err, a negative fd or a NULL/ERR_PTR file only trigger
+ * VFS_WARN_ON_ONCE() here and are not handled, so callers must check
+ * ->err before publishing.
+ */
+#define fd_publish(_fdf) \
+ ({ \
+ struct fd_prepare *fdp = &(_fdf); \
+ VFS_WARN_ON_ONCE(fdp->err); \
+ VFS_WARN_ON_ONCE(fdp->__fd < 0); \
+ VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \
+ fd_install(fdp->__fd, fdp->__file); \
+ fdp->__fd; \
+ })
+
+/*
+ * Declare @_fdf with FD_PREPARE() inside a statement expression and, if
+ * its ->err is 0, publish it with fd_publish(). Evaluates to the fd number
+ * on success and to ->err otherwise.
+ *
+ * Do not use directly.
+ */
+#define __FD_ADD(_fdf, _fd_flags, _file_owned) \
+ ({ \
+ FD_PREPARE(_fdf, _fd_flags, _file_owned); \
+ s32 ret = _fdf.err; \
+ if (likely(!ret)) \
+ ret = fd_publish(_fdf); \
+ ret; \
+ })
+
+/*
+ * FD_ADD - Allocate and install an fd and file in one step.
+ * @_fd_flags: flags for get_unused_fd_flags()
+ * @_file_owned: struct file to take ownership of
+ *
+ * Returns the allocated fd number, or negative error code on failure.
+ */
+#define FD_ADD(_fd_flags, _file_owned) \
+ __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned)
+
#endif /* __LINUX_FILE_H */
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index c2ce8ba05d06..54b824c05299 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int,
int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);
+int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg);
+int fcntl_getdeleg(struct file *filp, struct delegation *deleg);
static inline bool lock_is_unlock(struct file_lock *fl)
{
@@ -212,7 +214,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
void locks_init_lease(struct file_lease *);
void locks_free_lease(struct file_lease *fl);
struct file_lease *locks_alloc_lease(void);
-int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
+
+#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations
+#define LEASE_BREAK_DELEG BIT(1) // break delegations only
+#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only
+#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break
+#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event
+
+int __break_lease(struct inode *inode, unsigned int flags);
void lease_get_mtime(struct inode *, struct timespec64 *time);
int generic_setlease(struct file *, int, struct file_lease **, void **priv);
int kernel_setlease(struct file *, int, struct file_lease **, void **);
@@ -271,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp)
return F_UNLCK;
}
+static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg)
+{
+ return -EINVAL;
+}
+
+static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg)
+{
+ return -EINVAL;
+}
+
static inline bool lock_is_unlock(struct file_lock *fl)
{
return false;
@@ -367,7 +386,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f
return -ENOLCK;
}
-static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+static inline int __break_lease(struct inode *inode, unsigned int flags)
{
return 0;
}
@@ -428,6 +447,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
}
#ifdef CONFIG_FILE_LOCKING
+/*
+ * Translate open mode bits into LEASE_BREAK_* flags: a read-only access
+ * mode yields LEASE_BREAK_OPEN_RDONLY and O_NONBLOCK yields
+ * LEASE_BREAK_NONBLOCK. All other bits of @mode are ignored.
+ */
+static inline unsigned int openmode_to_lease_flags(unsigned int mode)
+{
+	unsigned int rdonly = ((mode & O_ACCMODE) == O_RDONLY) ?
+				LEASE_BREAK_OPEN_RDONLY : 0;
+	unsigned int nonblock = (mode & O_NONBLOCK) ? LEASE_BREAK_NONBLOCK : 0;
+
+	return rdonly | nonblock;
+}
+
static inline int break_lease(struct inode *inode, unsigned int mode)
{
struct file_lock_context *flctx;
@@ -443,11 +473,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
return 0;
smp_mb();
if (!list_empty_careful(&flctx->flc_lease))
- return __break_lease(inode, mode, FL_LEASE);
+ return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode));
return 0;
}
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
{
struct file_lock_context *flctx;
@@ -461,60 +491,84 @@ static inline int break_deleg(struct inode *inode, unsigned int mode)
if (!flctx)
return 0;
smp_mb();
- if (!list_empty_careful(&flctx->flc_lease))
- return __break_lease(inode, mode, FL_DELEG);
+ if (!list_empty_careful(&flctx->flc_lease)) {
+ flags |= LEASE_BREAK_DELEG;
+ return __break_lease(inode, flags);
+ }
return 0;
}
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+struct delegated_inode {
+ struct inode *di_inode;
+};
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+ return di->di_inode;
+}
+
+static inline int try_break_deleg(struct inode *inode,
+ struct delegated_inode *di)
{
int ret;
- ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);
- if (ret == -EWOULDBLOCK && delegated_inode) {
- *delegated_inode = inode;
+ ret = break_deleg(inode, LEASE_BREAK_NONBLOCK);
+ if (ret == -EWOULDBLOCK && di) {
+ di->di_inode = inode;
ihold(inode);
}
return ret;
}
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *di)
{
int ret;
- ret = break_deleg(*delegated_inode, O_WRONLY);
- iput(*delegated_inode);
- *delegated_inode = NULL;
+ ret = break_deleg(di->di_inode, 0);
+ iput(di->di_inode);
+ di->di_inode = NULL;
return ret;
}
static inline int break_layout(struct inode *inode, bool wait)
{
smp_mb();
- if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
- return __break_lease(inode,
- wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
- FL_LAYOUT);
+ if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) {
+ unsigned int flags = LEASE_BREAK_LAYOUT;
+
+ if (!wait)
+ flags |= LEASE_BREAK_NONBLOCK;
+
+ return __break_lease(inode, flags);
+ }
return 0;
}
#else /* !CONFIG_FILE_LOCKING */
-static inline int break_lease(struct inode *inode, unsigned int mode)
+struct delegated_inode { };
+
+static inline bool is_delegated(struct delegated_inode *di)
+{
+ return false;
+}
+
+/*
+ * !CONFIG_FILE_LOCKING stub: there are no leases to break, so always
+ * succeed. Takes the open mode like the CONFIG_FILE_LOCKING variant so
+ * both builds share one prototype.
+ */
+static inline int break_lease(struct inode *inode, unsigned int mode)
{
return 0;
}
-static inline int break_deleg(struct inode *inode, unsigned int mode)
+static inline int break_deleg(struct inode *inode, unsigned int flags)
{
return 0;
}
-static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
+static inline int try_break_deleg(struct inode *inode,
+ struct delegated_inode *delegated_inode)
{
return 0;
}
-static inline int break_deleg_wait(struct inode **delegated_inode)
+static inline int break_deleg_wait(struct delegated_inode *delegated_inode)
{
BUG();
return 0;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5c859b8131a..973233b82dc1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -901,6 +901,26 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
cb->data_end = skb->data + skb_headlen(skb);
}
+/*
+ * Run @prog on @skb with freshly computed data pointers.
+ *
+ * The data_meta/data_end values currently stored in the skb control block
+ * are saved first, bpf_compute_data_pointers() refreshes them for the
+ * program run, and the saved values are put back afterwards.
+ *
+ * Returns the result of bpf_prog_run().
+ */
+static inline int bpf_prog_run_data_pointers(
+	const struct bpf_prog *prog,
+	struct sk_buff *skb)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+	void *orig_meta = cb->data_meta;
+	void *orig_end = cb->data_end;
+	int ret;
+
+	bpf_compute_data_pointers(skb);
+	ret = bpf_prog_run(prog, skb);
+
+	/* Put back whatever the control block held before the run. */
+	cb->data_meta = orig_meta;
+	cb->data_end = orig_end;
+
+	return ret;
+}
+
/* Similar to bpf_compute_data_pointers(), except that save orginal
* data in cb->data and cb->meta_data for restore.
*/
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c895146c1444..ce25feb06727 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H
+#include <linux/fs/super.h>
#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
@@ -11,7 +12,6 @@
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
-#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
@@ -37,7 +37,6 @@
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
-#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
@@ -52,11 +51,9 @@
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
-struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
-struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
@@ -70,16 +67,13 @@ struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
-struct workqueue_struct;
struct iov_iter;
-struct fscrypt_operations;
-struct fsverity_operations;
struct fsnotify_mark_connector;
-struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct file_kattr;
struct iomap_ops;
+struct delegated_inode;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -299,11 +293,6 @@ struct iattr {
};
/*
- * Includes for diskquotas.
- */
-#include <linux/quota.h>
-
-/*
* Maximum number of layers of fs stack. Needs to be limited to
* prevent kernel stack overflow
*/
@@ -367,23 +356,9 @@ struct readahead_control;
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
-/*
- * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
- * iocb completion can be passed back to the owner for execution from a safe
- * context rather than needing to be punted through a workqueue. If this
- * flag is set, the bio completion handling may set iocb->dio_complete to a
- * handler function and iocb->private to context information for that handler.
- * The issuer should call the handler with that context information from task
- * context to complete the processing of the iocb. Note that while this
- * provides a task context for the dio_complete() callback, it should only be
- * used on the completion side for non-IO generating completions. It's fine to
- * call blocking functions from this callback, but they should not wait for
- * unrelated IO (like cache flushing, new IO generation, etc).
- */
-#define IOCB_DIO_CALLER_COMP (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
-#define IOCB_AIO_RW (1 << 23)
-#define IOCB_HAS_METADATA (1 << 24)
+#define IOCB_AIO_RW (1 << 22)
+#define IOCB_HAS_METADATA (1 << 23)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
@@ -400,7 +375,6 @@ struct readahead_control;
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
- { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \
{ IOCB_AIO_RW, "AIO_RW" }, \
{ IOCB_HAS_METADATA, "AIO_HAS_METADATA" }
@@ -412,23 +386,13 @@ struct kiocb {
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
u8 ki_write_stream;
- union {
- /*
- * Only used for async buffered reads, where it denotes the
- * page waitqueue associated with completing the read. Valid
- * IFF IOCB_WAITQ is set.
- */
- struct wait_page_queue *ki_waitq;
- /*
- * Can be used for O_DIRECT IO, where the completion handling
- * is punted back to the issuer of the IO. May only be set
- * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
- * must then check for presence of this handler when ki_complete
- * is invoked. The data passed in to this handler must be
- * assigned to ->private when dio_complete is assigned.
- */
- ssize_t (*dio_complete)(void *data);
- };
+
+ /*
+ * Only used for async buffered reads, where it denotes the page
+ * waitqueue associated with completing the read.
+ * Valid IFF IOCB_WAITQ is set.
+ */
+ struct wait_page_queue *ki_waitq;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -659,13 +623,14 @@ is_uncached_acl(struct posix_acl *acl)
return (long)acl & 1;
}
-#define IOP_FASTPERM 0x0001
-#define IOP_LOOKUP 0x0002
-#define IOP_NOFOLLOW 0x0004
-#define IOP_XATTR 0x0008
+#define IOP_FASTPERM 0x0001
+#define IOP_LOOKUP 0x0002
+#define IOP_NOFOLLOW 0x0004
+#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
-#define IOP_MGTIME 0x0020
-#define IOP_CACHED_LINK 0x0040
+#define IOP_MGTIME 0x0020
+#define IOP_CACHED_LINK 0x0040
+#define IOP_FASTPERM_MAY_EXEC 0x0080
/*
* Inode state bits. Protected by inode->i_lock
@@ -759,7 +724,7 @@ enum inode_state_bits {
/* reserved wait address bit 3 */
};
-enum inode_state_flags_t {
+enum inode_state_flags_enum {
I_NEW = (1U << __I_NEW),
I_SYNC = (1U << __I_SYNC),
I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING),
@@ -786,6 +751,13 @@ enum inode_state_flags_t {
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
/*
+ * Use inode_state_read() & friends to access.
+ */
+struct inode_state_flags {
+ enum inode_state_flags_enum __state;
+};
+
+/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
@@ -793,14 +765,13 @@ enum inode_state_flags_t {
struct inode {
umode_t i_mode;
unsigned short i_opflags;
- kuid_t i_uid;
- kgid_t i_gid;
unsigned int i_flags;
-
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
+ kuid_t i_uid;
+ kgid_t i_gid;
const struct inode_operations *i_op;
struct super_block *i_sb;
@@ -843,7 +814,7 @@ struct inode {
#endif
/* Misc */
- enum inode_state_flags_t i_state;
+ struct inode_state_flags i_state;
/* 32-bit hole */
struct rw_semaphore i_rwsem;
@@ -902,6 +873,80 @@ struct inode {
void *i_private; /* fs or device private pointer */
} __randomize_layout;
+/*
+ * i_state handling
+ *
+ * We hide all of it behind helpers so that we can validate consumers.
+ *
+ * inode_state_read_once() reads the flags with READ_ONCE() and, unlike
+ * inode_state_read(), does not assert that i_lock is held.
+ */
+static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode)
+{
+ return READ_ONCE(inode->i_state.__state);
+}
+
+static inline enum inode_state_flags_enum inode_state_read(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+ return inode->i_state.__state;
+}
+
+static inline void inode_state_set_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags);
+}
+
+static inline void inode_state_set(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_set_raw(inode, flags);
+}
+
+static inline void inode_state_clear_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags);
+}
+
+static inline void inode_state_clear(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_clear_raw(inode, flags);
+}
+
+static inline void inode_state_assign_raw(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ WRITE_ONCE(inode->i_state.__state, flags);
+}
+
+static inline void inode_state_assign(struct inode *inode,
+ enum inode_state_flags_enum flags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_assign_raw(inode, flags);
+}
+
+/*
+ * Clear @clearflags and then set @setflags in one store to i_state,
+ * without asserting that i_lock is held.
+ */
+static inline void inode_state_replace_raw(struct inode *inode,
+					   enum inode_state_flags_enum clearflags,
+					   enum inode_state_flags_enum setflags)
+{
+	enum inode_state_flags_enum cur = inode->i_state.__state;
+
+	inode_state_assign_raw(inode, (cur & ~clearflags) | setflags);
+}
+
+static inline void inode_state_replace(struct inode *inode,
+ enum inode_state_flags_enum clearflags,
+ enum inode_state_flags_enum setflags)
+{
+ lockdep_assert_held(&inode->i_lock);
+ inode_state_replace_raw(inode, clearflags, setflags);
+}
+
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
@@ -949,6 +994,8 @@ static inline void inode_fake_hash(struct inode *inode)
hlist_add_fake(&inode->i_hash);
}
+void wait_on_new_inode(struct inode *inode);
+
/*
* inode->i_rwsem nesting subclasses for the lock validator:
*
@@ -1348,49 +1395,6 @@ extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct file *file);
/*
- * sb->s_flags. Note that these mirror the equivalent MS_* flags where
- * represented in both.
- */
-#define SB_RDONLY BIT(0) /* Mount read-only */
-#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */
-#define SB_NODEV BIT(2) /* Disallow access to device special files */
-#define SB_NOEXEC BIT(3) /* Disallow program execution */
-#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */
-#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */
-#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */
-#define SB_NOATIME BIT(10) /* Do not update access times. */
-#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
-#define SB_SILENT BIT(15)
-#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */
-#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */
-#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */
-#define SB_I_VERSION BIT(23) /* Update inode I_version field */
-#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */
-
-/* These sb flags are internal to the kernel */
-#define SB_DEAD BIT(21)
-#define SB_DYING BIT(24)
-#define SB_FORCE BIT(27)
-#define SB_NOSEC BIT(28)
-#define SB_BORN BIT(29)
-#define SB_ACTIVE BIT(30)
-#define SB_NOUSER BIT(31)
-
-/* These flags relate to encoding and casefolding */
-#define SB_ENC_STRICT_MODE_FL (1 << 0)
-#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)
-
-#define sb_has_strict_encoding(sb) \
- (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
-
-#if IS_ENABLED(CONFIG_UNICODE)
-#define sb_no_casefold_compat_fallback(sb) \
- (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
-#else
-#define sb_no_casefold_compat_fallback(sb) (1)
-#endif
-
-/*
* Umount options
*/
@@ -1400,191 +1404,6 @@ extern int send_sigurg(struct file *file);
#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
-/* sb->s_iflags */
-#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
-#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
-#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
-#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
-
-/* sb->s_iflags to limit user namespace mounts */
-#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
-#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
-#define SB_I_UNTRUSTED_MOUNTER 0x00000040
-#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
-
-#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
-#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
-#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
-#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
-#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
-#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
-#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
-
-/* Possible states of 'frozen' field */
-enum {
- SB_UNFROZEN = 0, /* FS is unfrozen */
- SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
- SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
- SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop
- * internal threads if needed) */
- SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
-};
-
-#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
-
-struct sb_writers {
- unsigned short frozen; /* Is sb frozen? */
- int freeze_kcount; /* How many kernel freeze requests? */
- int freeze_ucount; /* How many userspace freeze requests? */
- const void *freeze_owner; /* Owner of the freeze */
- struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
-};
-
-struct mount;
-
-struct super_block {
- struct list_head s_list; /* Keep this first */
- dev_t s_dev; /* search index; _not_ kdev_t */
- unsigned char s_blocksize_bits;
- unsigned long s_blocksize;
- loff_t s_maxbytes; /* Max file size */
- struct file_system_type *s_type;
- const struct super_operations *s_op;
- const struct dquot_operations *dq_op;
- const struct quotactl_ops *s_qcop;
- const struct export_operations *s_export_op;
- unsigned long s_flags;
- unsigned long s_iflags; /* internal SB_I_* flags */
- unsigned long s_magic;
- struct dentry *s_root;
- struct rw_semaphore s_umount;
- int s_count;
- atomic_t s_active;
-#ifdef CONFIG_SECURITY
- void *s_security;
-#endif
- const struct xattr_handler * const *s_xattr;
-#ifdef CONFIG_FS_ENCRYPTION
- const struct fscrypt_operations *s_cop;
- struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */
-#endif
-#ifdef CONFIG_FS_VERITY
- const struct fsverity_operations *s_vop;
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
- struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
- struct mount *s_mounts; /* list of mounts; _not_ for fs use */
- struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
- struct file *s_bdev_file;
- struct backing_dev_info *s_bdi;
- struct mtd_info *s_mtd;
- struct hlist_node s_instances;
- unsigned int s_quota_types; /* Bitmask of supported quota types */
- struct quota_info s_dquot; /* Diskquota specific options */
-
- struct sb_writers s_writers;
-
- /*
- * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
- * s_fsnotify_info together for cache efficiency. They are frequently
- * accessed and rarely modified.
- */
- void *s_fs_info; /* Filesystem private info */
-
- /* Granularity of c/m/atime in ns (cannot be worse than a second) */
- u32 s_time_gran;
- /* Time limits for c/m/atime in seconds */
- time64_t s_time_min;
- time64_t s_time_max;
-#ifdef CONFIG_FSNOTIFY
- u32 s_fsnotify_mask;
- struct fsnotify_sb_info *s_fsnotify_info;
-#endif
-
- /*
- * q: why are s_id and s_sysfs_name not the same? both are human
- * readable strings that identify the filesystem
- * a: s_id is allowed to change at runtime; it's used in log messages,
- * and we want to when a device starts out as single device (s_id is dev
- * name) but then a device is hot added and we have to switch to
- * identifying it by UUID
- * but s_sysfs_name is a handle for programmatic access, and can't
- * change at runtime
- */
- char s_id[32]; /* Informational name */
- uuid_t s_uuid; /* UUID */
- u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
-
- /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
- char s_sysfs_name[UUID_STRING_LEN + 1];
-
- unsigned int s_max_links;
- unsigned int s_d_flags; /* default d_flags for dentries */
-
- /*
- * The next field is for VFS *only*. No filesystems have any business
- * even looking at it. You had been warned.
- */
- struct mutex s_vfs_rename_mutex; /* Kludge */
-
- /*
- * Filesystem subtype. If non-empty the filesystem type field
- * in /proc/mounts will be "type.subtype"
- */
- const char *s_subtype;
-
- const struct dentry_operations *__s_d_op; /* default d_op for dentries */
-
- struct shrinker *s_shrink; /* per-sb shrinker handle */
-
- /* Number of inodes with nlink == 0 but still referenced */
- atomic_long_t s_remove_count;
-
- /* Read-only state of the superblock is being changed */
- int s_readonly_remount;
-
- /* per-sb errseq_t for reporting writeback errors via syncfs */
- errseq_t s_wb_err;
-
- /* AIO completions deferred from interrupt context */
- struct workqueue_struct *s_dio_done_wq;
- struct hlist_head s_pins;
-
- /*
- * Owning user namespace and default context in which to
- * interpret filesystem uids, gids, quotas, device nodes,
- * xattrs and security labels.
- */
- struct user_namespace *s_user_ns;
-
- /*
- * The list_lru structure is essentially just a pointer to a table
- * of per-node lru lists, each of which has its own spinlock.
- * There is no need to put them into separate cachelines.
- */
- struct list_lru s_dentry_lru;
- struct list_lru s_inode_lru;
- struct rcu_head rcu;
- struct work_struct destroy_work;
-
- struct mutex s_sync_lock; /* sync serialisation lock */
-
- /*
- * Indicates how deep in a filesystem stack this SB is
- */
- int s_stack_depth;
-
- /* s_inode_list_lock protects s_inodes */
- spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
- struct list_head s_inodes; /* all inodes */
-
- spinlock_t s_inode_wblist_lock;
- struct list_head s_inodes_wb; /* writeback inodes */
-} __randomize_layout;
-
static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
return inode->i_sb->s_user_ns;
@@ -1902,66 +1721,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode);
* Snapshotting support.
*/
-/*
- * These are internal functions, please use sb_start_{write,pagefault,intwrite}
- * instead.
- */
-static inline void __sb_end_write(struct super_block *sb, int level)
-{
- percpu_up_read(sb->s_writers.rw_sem + level-1);
-}
-
-static inline void __sb_start_write(struct super_block *sb, int level)
-{
- percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
-}
-
-static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
-{
- return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
-}
-
-#define __sb_writers_acquired(sb, lev) \
- percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
-#define __sb_writers_release(sb, lev) \
- percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
-
-/**
- * __sb_write_started - check if sb freeze level is held
- * @sb: the super we write to
- * @level: the freeze level
- *
- * * > 0 - sb freeze level is held
- * * 0 - sb freeze level is not held
- * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
- */
-static inline int __sb_write_started(const struct super_block *sb, int level)
-{
- return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
-}
-
-/**
- * sb_write_started - check if SB_FREEZE_WRITE is held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_started(const struct super_block *sb)
-{
- return __sb_write_started(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_write_not_started - check if SB_FREEZE_WRITE is not held
- * @sb: the super we write to
- *
- * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
- */
-static inline bool sb_write_not_started(const struct super_block *sb)
-{
- return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
-}
-
/**
* file_write_started - check if SB_FREEZE_WRITE is held
* @file: the file we write to
@@ -1992,137 +1751,26 @@ static inline bool file_write_not_started(const struct file *file)
return sb_write_not_started(file_inode(file)->i_sb);
}
-/**
- * sb_end_write - drop write access to a superblock
- * @sb: the super we wrote to
- *
- * Decrement number of writers to the filesystem. Wake up possible waiters
- * wanting to freeze the filesystem.
- */
-static inline void sb_end_write(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_end_pagefault - drop write access to a superblock from a page fault
- * @sb: the super we wrote to
- *
- * Decrement number of processes handling write page fault to the filesystem.
- * Wake up possible waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_pagefault(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_end_intwrite - drop write access to a superblock for internal fs purposes
- * @sb: the super we wrote to
- *
- * Decrement fs-internal number of writers to the filesystem. Wake up possible
- * waiters wanting to freeze the filesystem.
- */
-static inline void sb_end_intwrite(struct super_block *sb)
-{
- __sb_end_write(sb, SB_FREEZE_FS);
-}
-
-/**
- * sb_start_write - get write access to a superblock
- * @sb: the super we write to
- *
- * When a process wants to write data or metadata to a file system (i.e. dirty
- * a page or an inode), it should embed the operation in a sb_start_write() -
- * sb_end_write() pair to get exclusion against file system freezing. This
- * function increments number of writers preventing freezing. If the file
- * system is already frozen, the function waits until the file system is
- * thawed.
- *
- * Since freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. Generally,
- * freeze protection should be the outermost lock. In particular, we have:
- *
- * sb_start_write
- * -> i_rwsem (write path, truncate, directory ops, ...)
- * -> s_umount (freeze_super, thaw_super)
- */
-static inline void sb_start_write(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_WRITE);
-}
-
-static inline bool sb_start_write_trylock(struct super_block *sb)
-{
- return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
-}
-
-/**
- * sb_start_pagefault - get write access to a superblock from a page fault
- * @sb: the super we write to
- *
- * When a process starts handling write page fault, it should embed the
- * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
- * exclusion against file system freezing. This is needed since the page fault
- * is going to dirty a page. This function increments number of running page
- * faults preventing freezing. If the file system is already frozen, the
- * function waits until the file system is thawed.
- *
- * Since page fault freeze protection behaves as a lock, users have to preserve
- * ordering of freeze protection and other filesystem locks. It is advised to
- * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
- * handling code implies lock dependency:
- *
- * mmap_lock
- * -> sb_start_pagefault
- */
-static inline void sb_start_pagefault(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
-}
-
-/**
- * sb_start_intwrite - get write access to a superblock for internal fs purposes
- * @sb: the super we write to
- *
- * This is the third level of protection against filesystem freezing. It is
- * free for use by a filesystem. The only requirement is that it must rank
- * below sb_start_pagefault.
- *
- * For example filesystem can call sb_start_intwrite() when starting a
- * transaction which somewhat eases handling of freezing for internal sources
- * of filesystem changes (internal fs threads, discarding preallocation on file
- * close, etc.).
- */
-static inline void sb_start_intwrite(struct super_block *sb)
-{
- __sb_start_write(sb, SB_FREEZE_FS);
-}
-
-static inline bool sb_start_intwrite_trylock(struct super_block *sb)
-{
- return __sb_start_write_trylock(sb, SB_FREEZE_FS);
-}
-
bool inode_owner_or_capable(struct mnt_idmap *idmap,
const struct inode *inode);
/*
* VFS helper functions..
*/
-int vfs_create(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t, bool);
+int vfs_create(struct mnt_idmap *, struct dentry *, umode_t,
+ struct delegated_inode *);
struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+ struct dentry *, umode_t, struct delegated_inode *);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t, dev_t);
+ umode_t, dev_t, struct delegated_inode *);
int vfs_symlink(struct mnt_idmap *, struct inode *,
- struct dentry *, const char *);
+ struct dentry *, const char *, struct delegated_inode *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
- struct dentry *, struct inode **);
-int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
+ struct dentry *, struct delegated_inode *);
+int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ struct delegated_inode *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
- struct inode **);
+ struct delegated_inode *);
/**
* struct renamedata - contains all information required for renaming
@@ -2140,7 +1788,7 @@ struct renamedata {
struct dentry *old_dentry;
struct dentry *new_parent;
struct dentry *new_dentry;
- struct inode **delegated_inode;
+ struct delegated_inode *delegated_inode;
unsigned int flags;
} __randomize_layout;
@@ -2150,7 +1798,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *dentry)
{
return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
- WHITEOUT_DEV);
+ WHITEOUT_DEV, NULL);
}
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
@@ -2431,72 +2079,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
struct file *dst_file, loff_t dst_pos,
loff_t len, unsigned int remap_flags);
-/**
- * enum freeze_holder - holder of the freeze
- * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
- * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
- * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
- * @FREEZE_EXCL: a freeze that can only be undone by the owner
- *
- * Indicate who the owner of the freeze or thaw request is and whether
- * the freeze needs to be exclusive or can nest.
- * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
- * same holder aren't allowed. It is however allowed to hold a single
- * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
- * the same time. This is relied upon by some filesystems during online
- * repair or similar.
- */
-enum freeze_holder {
- FREEZE_HOLDER_KERNEL = (1U << 0),
- FREEZE_HOLDER_USERSPACE = (1U << 1),
- FREEZE_MAY_NEST = (1U << 2),
- FREEZE_EXCL = (1U << 3),
-};
-
-struct super_operations {
- struct inode *(*alloc_inode)(struct super_block *sb);
- void (*destroy_inode)(struct inode *);
- void (*free_inode)(struct inode *);
-
- void (*dirty_inode) (struct inode *, int flags);
- int (*write_inode) (struct inode *, struct writeback_control *wbc);
- int (*drop_inode) (struct inode *);
- void (*evict_inode) (struct inode *);
- void (*put_super) (struct super_block *);
- int (*sync_fs)(struct super_block *sb, int wait);
- int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner);
- int (*freeze_fs) (struct super_block *);
- int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner);
- int (*unfreeze_fs) (struct super_block *);
- int (*statfs) (struct dentry *, struct kstatfs *);
- int (*remount_fs) (struct super_block *, int *, char *);
- void (*umount_begin) (struct super_block *);
-
- int (*show_options)(struct seq_file *, struct dentry *);
- int (*show_devname)(struct seq_file *, struct dentry *);
- int (*show_path)(struct seq_file *, struct dentry *);
- int (*show_stats)(struct seq_file *, struct dentry *);
-#ifdef CONFIG_QUOTA
- ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
- ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
- struct dquot __rcu **(*get_dquots)(struct inode *);
-#endif
- long (*nr_cached_objects)(struct super_block *,
- struct shrink_control *);
- long (*free_cached_objects)(struct super_block *,
- struct shrink_control *);
- /*
- * If a filesystem can support graceful removal of a device and
- * continue read-write operations, implement this callback.
- *
- * Return 0 if the filesystem can continue read-write.
- * Non-zero return value or no such callback means the fs will be shutdown
- * as usual.
- */
- int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
- void (*shutdown)(struct super_block *sb);
-};
-
/*
* Inode flags - they have no relation to superblock flags now
*/
@@ -2539,7 +2121,6 @@ struct super_operations {
*/
#define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg))
-static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; }
#define IS_RDONLY(inode) sb_rdonly((inode)->i_sb)
#define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \
((inode)->i_flags & S_SYNC))
@@ -2635,8 +2216,8 @@ static inline int icount_read(const struct inode *inode)
*/
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
- return (inode->i_state & (I_DIRTY_TIME | I_NEW |
- I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
+ return (inode_state_read_once(inode) &
+ (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME;
}
extern void inc_nlink(struct inode *inode);
@@ -2689,6 +2270,7 @@ struct file_system_type {
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_LBS 128 /* FS supports LBS */
+#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2773,10 +2355,6 @@ extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
-int freeze_super(struct super_block *super, enum freeze_holder who,
- const void *freeze_owner);
-int thaw_super(struct super_block *super, enum freeze_holder who,
- const void *freeze_owner);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);
@@ -2819,10 +2397,9 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
va_end(args);
}
-extern int current_umask(void);
-
extern void ihold(struct inode * inode);
extern void iput(struct inode *);
+void iput_not_last(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);
@@ -2963,12 +2540,6 @@ extern struct kmem_cache *names_cachep;
#define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL)
#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
-extern struct super_block *blockdev_superblock;
-static inline bool sb_is_blkdev_sb(struct super_block *sb)
-{
- return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
-}
-
void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
extern const struct file_operations def_blk_fops;
@@ -3014,7 +2585,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end);
static inline int file_write_and_wait(struct file *file)
@@ -3051,8 +2622,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
- filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count,
- iocb->ki_pos - 1);
+ filemap_flush_range(mapping, iocb->ki_pos - count,
+ iocb->ki_pos - 1);
}
return count;
@@ -3071,7 +2642,7 @@ static inline int bmap(struct inode *inode, sector_t *block)
#endif
int notify_change(struct mnt_idmap *, struct dentry *,
- struct iattr *, struct inode **);
+ struct iattr *, struct delegated_inode *);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
static inline int file_permission(struct file *file, int mask)
@@ -3101,7 +2672,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
* file_start_write - get write access to a superblock for regular file io
* @file: the file we want to write to
*
- * This is a variant of sb_start_write() which is a noop on non-regualr file.
+ * This is a variant of sb_start_write() which is a noop on non-regular file.
* Should be matched with a call to file_end_write().
*/
static inline void file_start_write(struct file *file)
@@ -3269,6 +2840,7 @@ extern struct file * open_exec(const char *);
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);
+u64 vfsmount_to_propagation_flags(struct vfsmount *mnt);
extern char *file_path(struct file *, char *, int);
@@ -3326,7 +2898,7 @@ extern void d_mark_dontcache(struct inode *inode);
extern struct inode *ilookup5_nowait(struct super_block *sb,
unsigned long hashval, int (*test)(struct inode *, void *),
- void *data);
+ void *data, bool *isnew);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
@@ -3378,11 +2950,9 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
-/*
- * inode->i_lock must be held
- */
static inline void __iget(struct inode *inode)
{
+ lockdep_assert_held(&inode->i_lock);
atomic_inc(&inode->i_count);
}
@@ -3421,10 +2991,7 @@ static inline void remove_inode_hash(struct inode *inode)
}
extern void inode_sb_list_add(struct inode *inode);
-extern void inode_add_lru(struct inode *inode);
-
-extern int sb_set_blocksize(struct super_block *, int);
-extern int sb_min_blocksize(struct super_block *, int);
+extern void inode_lru_list_add(struct inode *inode);
int generic_file_mmap(struct file *, struct vm_area_struct *);
int generic_file_mmap_prepare(struct vm_area_desc *desc);
@@ -3606,9 +3173,11 @@ extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg);
extern void iterate_supers_type(struct file_system_type *,
void (*)(struct super_block *, void *), void *);
-void filesystems_freeze(void);
+void filesystems_freeze(bool freeze_all);
void filesystems_thaw(void);
+void end_dirop(struct dentry *de);
+
extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
@@ -3745,38 +3314,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir,
}
#endif
-static inline struct unicode_map *sb_encoding(const struct super_block *sb)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
- return sb->s_encoding;
-#else
- return NULL;
-#endif
-}
-
-static inline bool sb_has_encoding(const struct super_block *sb)
-{
- return !!sb_encoding(sb);
-}
-
-/*
- * Compare if two super blocks have the same encoding and flags
- */
-static inline bool sb_same_encoding(const struct super_block *sb1,
- const struct super_block *sb2)
-{
-#if IS_ENABLED(CONFIG_UNICODE)
- if (sb1->s_encoding == sb2->s_encoding)
- return true;
-
- return (sb1->s_encoding && sb2->s_encoding &&
- (sb1->s_encoding->version == sb2->s_encoding->version) &&
- (sb1->s_encoding_flags == sb2->s_encoding_flags));
-#else
- return true;
-#endif
-}
-
int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
new file mode 100644
index 000000000000..f21ffbb6dea5
--- /dev/null
+++ b/include/linux/fs/super.h
@@ -0,0 +1,238 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_H
+#define _LINUX_FS_SUPER_H
+
+#include <linux/fs/super_types.h>
+#include <linux/unicode.h>
+
+/*
+ * These are internal functions; please use the
+ * sb_{start,end}_{write,pagefault,intwrite}() wrappers instead.
+ */
+static inline void __sb_end_write(struct super_block *sb, int level)
+{
+ percpu_up_read(sb->s_writers.rw_sem + level - 1);
+}
+
+static inline void __sb_start_write(struct super_block *sb, int level)
+{
+ percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true);
+}
+
+static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
+{
+ return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1);
+}
+
+#define __sb_writers_acquired(sb, lev) \
+ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_)
+#define __sb_writers_release(sb, lev) \
+ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_)
+
+/**
+ * __sb_write_started - check if sb freeze level is held
+ * @sb: the super we write to
+ * @level: the freeze level
+ *
+ * * > 0 - sb freeze level is held
+ * * 0 - sb freeze level is not held
+ * * < 0 - lock state unknown (!CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN)
+ */
+static inline int __sb_write_started(const struct super_block *sb, int level)
+{
+ return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1);
+}
+
+/**
+ * sb_write_started - check if SB_FREEZE_WRITE is held
+ * @sb: the super we write to
+ *
+ * May be a false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN: any nonzero result is true.
+ */
+static inline bool sb_write_started(const struct super_block *sb)
+{
+ return __sb_write_started(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_write_not_started - check if SB_FREEZE_WRITE is not held
+ * @sb: the super we write to
+ *
+ * May be a false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN: any result <= 0 is true.
+ */
+static inline bool sb_write_not_started(const struct super_block *sb)
+{
+ return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0;
+}
+
+/**
+ * sb_end_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement the number of writers to the filesystem. Wake up possible waiters
+ * wanting to freeze the filesystem.
+ */
+static inline void sb_end_write(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_end_pagefault - drop write access to a superblock from a page fault
+ * @sb: the super we wrote to
+ *
+ * Decrement the number of processes handling a write page fault to the
+ * filesystem. Wake up possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_pagefault(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_end_intwrite - drop write access to a superblock for internal fs purposes
+ * @sb: the super we wrote to
+ *
+ * Decrement the number of fs-internal writers to the filesystem. Wake up
+ * possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_intwrite(struct super_block *sb)
+{
+ __sb_end_write(sb, SB_FREEZE_FS);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data or metadata to a file system (i.e. dirty
+ * a page or an inode), it should embed the operation in a sb_start_write() -
+ * sb_end_write() pair to get exclusion against file system freezing. This
+ * function increments the number of writers, preventing freezing. If the file
+ * system is already frozen, the function waits until the file system is
+ * thawed.
+ *
+ * Since freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. Generally,
+ * freeze protection should be the outermost lock. In particular, we have:
+ *
+ * sb_start_write
+ * -> i_rwsem (write path, truncate, directory ops, ...)
+ * -> s_umount (freeze_super, thaw_super)
+ */
+static inline void sb_start_write(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_WRITE);
+}
+
+DEFINE_GUARD(super_write,
+ struct super_block *,
+ sb_start_write(_T),
+ sb_end_write(_T))
+
+static inline bool sb_start_write_trylock(struct super_block *sb)
+{
+ return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_start_pagefault - get write access to a superblock from a page fault
+ * @sb: the super we write to
+ *
+ * When a process starts handling a write page fault, it should embed the
+ * operation into a sb_start_pagefault() - sb_end_pagefault() pair to get
+ * exclusion against file system freezing. This is needed since the page fault
+ * is going to dirty a page. This function increments the number of running
+ * page faults, preventing freezing. If the file system is already frozen, the
+ * function waits until the file system is thawed.
+ *
+ * Since page fault freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. It is advised to
+ * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
+ * handling code implies lock dependency:
+ *
+ * mmap_lock
+ * -> sb_start_pagefault
+ */
+static inline void sb_start_pagefault(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_start_intwrite - get write access to a superblock for internal fs purposes
+ * @sb: the super we write to
+ *
+ * This is the third level of protection against filesystem freezing. It is
+ * free for use by a filesystem. The only requirement is that it must rank
+ * below sb_start_pagefault().
+ *
+ * For example, a filesystem can call sb_start_intwrite() when starting a
+ * transaction, which somewhat eases handling of freezing for internal sources
+ * of filesystem changes (internal fs threads, discarding preallocation on file
+ * close, etc.).
+ */
+static inline void sb_start_intwrite(struct super_block *sb)
+{
+ __sb_start_write(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_start_intwrite_trylock(struct super_block *sb)
+{
+ return __sb_start_write_trylock(sb, SB_FREEZE_FS);
+}
+
+static inline bool sb_rdonly(const struct super_block *sb)
+{
+ return sb->s_flags & SB_RDONLY;
+}
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+ return sb->s_encoding;
+}
+
+/* Check whether two super blocks share an encoding: same map, or same version and flags */
+static inline bool sb_same_encoding(const struct super_block *sb1,
+ const struct super_block *sb2)
+{
+ if (sb1->s_encoding == sb2->s_encoding)
+ return true;
+
+ return (sb1->s_encoding && sb2->s_encoding &&
+ (sb1->s_encoding->version == sb2->s_encoding->version) &&
+ (sb1->s_encoding_flags == sb2->s_encoding_flags));
+}
+#else
+static inline struct unicode_map *sb_encoding(const struct super_block *sb)
+{
+ return NULL;
+}
+
+static inline bool sb_same_encoding(const struct super_block *sb1,
+ const struct super_block *sb2)
+{
+ return true;
+}
+#endif
+
+static inline bool sb_has_encoding(const struct super_block *sb)
+{
+ return !!sb_encoding(sb);
+}
+
+int sb_set_blocksize(struct super_block *sb, int size);
+int __must_check sb_min_blocksize(struct super_block *sb, int size);
+
+int freeze_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
+int thaw_super(struct super_block *super, enum freeze_holder who,
+ const void *freeze_owner);
+
+#endif /* _LINUX_FS_SUPER_H */
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
new file mode 100644
index 000000000000..6bd3009e09b3
--- /dev/null
+++ b/include/linux/fs/super_types.h
@@ -0,0 +1,336 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FS_SUPER_TYPES_H
+#define _LINUX_FS_SUPER_TYPES_H
+
+#include <linux/fs_dirent.h>
+#include <linux/errseq.h>
+#include <linux/list_lru.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/llist.h>
+#include <linux/uidgid.h>
+#include <linux/uuid.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/workqueue_types.h>
+#include <linux/quota.h>
+
+struct backing_dev_info;
+struct block_device;
+struct dentry;
+struct dentry_operations;
+struct dquot_operations;
+struct export_operations;
+struct file;
+struct file_system_type;
+struct fscrypt_operations;
+struct fsnotify_sb_info;
+struct fsverity_operations;
+struct kstatfs;
+struct mount;
+struct mtd_info;
+struct quotactl_ops;
+struct shrinker;
+struct unicode_map;
+struct user_namespace;
+struct workqueue_struct;
+struct writeback_control;
+struct xattr_handler;
+
+extern struct super_block *blockdev_superblock;
+
+/* Possible states of 'frozen' field */
+enum {
+ SB_UNFROZEN = 0, /* FS is unfrozen */
+ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */
+ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */
+ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop internal threads if needed) */
+ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */
+};
+
+/* Evaluates to 3; sizes the rw_sem[] array in struct sb_writers. */
+#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
+
+/* Freeze state of a superblock, embedded as super_block::s_writers. */
+struct sb_writers {
+ unsigned short frozen; /* Is sb frozen? */
+ int freeze_kcount; /* How many kernel freeze requests? */
+ int freeze_ucount; /* How many userspace freeze requests? */
+ const void *freeze_owner; /* Owner of the freeze */
+ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS];
+};
+
+/**
+ * enum freeze_holder - holder of the freeze
+ * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
+ * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
+ * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
+ * @FREEZE_EXCL: a freeze that can only be undone by the owner
+ *
+ * Indicate who the owner of the freeze or thaw request is and whether
+ * the freeze needs to be exclusive or can nest.
+ * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
+ * same holder aren't allowed. It is however allowed to hold a single
+ * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
+ * the same time. This is relied upon by some filesystems during online
+ * repair or similar.
+ *
+ * Each enumerator occupies its own bit.
+ */
+enum freeze_holder {
+ FREEZE_HOLDER_KERNEL = (1U << 0),
+ FREEZE_HOLDER_USERSPACE = (1U << 1),
+ FREEZE_MAY_NEST = (1U << 2),
+ FREEZE_EXCL = (1U << 3),
+};
+
+/*
+ * Superblock callback table, reached through super_block::s_op.
+ *
+ * NOTE(review): struct inode, struct seq_file and struct shrink_control are
+ * used in these prototypes but are not among the forward declarations at the
+ * top of this header; presumably one of the includes provides them -- confirm.
+ */
+struct super_operations {
+ struct inode *(*alloc_inode)(struct super_block *sb);
+ void (*destroy_inode)(struct inode *inode);
+ void (*free_inode)(struct inode *inode);
+ void (*dirty_inode)(struct inode *inode, int flags);
+ int (*write_inode)(struct inode *inode, struct writeback_control *wbc);
+ int (*drop_inode)(struct inode *inode);
+ void (*evict_inode)(struct inode *inode);
+ void (*put_super)(struct super_block *sb);
+ int (*sync_fs)(struct super_block *sb, int wait);
+ int (*freeze_super)(struct super_block *sb, enum freeze_holder who,
+ const void *owner);
+ int (*freeze_fs)(struct super_block *sb);
+ int (*thaw_super)(struct super_block *sb, enum freeze_holder who,
+ const void *owner);
+ int (*unfreeze_fs)(struct super_block *sb);
+ int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs);
+ int (*remount_fs) (struct super_block *, int *, char *);
+ void (*umount_begin)(struct super_block *sb);
+
+ int (*show_options)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_devname)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_path)(struct seq_file *seq, struct dentry *dentry);
+ int (*show_stats)(struct seq_file *seq, struct dentry *dentry);
+#ifdef CONFIG_QUOTA
+ ssize_t (*quota_read)(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+ ssize_t (*quota_write)(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+ struct dquot __rcu **(*get_dquots)(struct inode *inode);
+#endif
+ long (*nr_cached_objects)(struct super_block *sb,
+ struct shrink_control *sc);
+ long (*free_cached_objects)(struct super_block *sb,
+ struct shrink_control *sc);
+ /*
+ * If a filesystem can support graceful removal of a device and
+ * continue read-write operations, implement this callback.
+ *
+ * Return 0 if the filesystem can continue read-write.
+ * Non-zero return value or no such callback means the fs will be shutdown
+ * as usual.
+ */
+ int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
+ void (*shutdown)(struct super_block *sb);
+};
+
+/* Superblock object; its callbacks live in s_op (struct super_operations). */
+struct super_block {
+ struct list_head s_list; /* Keep this first */
+ dev_t s_dev; /* search index; _not_ kdev_t */
+ unsigned char s_blocksize_bits;
+ unsigned long s_blocksize;
+ loff_t s_maxbytes; /* Max file size */
+ struct file_system_type *s_type;
+ const struct super_operations *s_op;
+ const struct dquot_operations *dq_op;
+ const struct quotactl_ops *s_qcop;
+ const struct export_operations *s_export_op;
+ unsigned long s_flags;
+ unsigned long s_iflags; /* internal SB_I_* flags */
+ unsigned long s_magic;
+ struct dentry *s_root;
+ struct rw_semaphore s_umount;
+ int s_count;
+ atomic_t s_active;
+#ifdef CONFIG_SECURITY
+ void *s_security;
+#endif
+ const struct xattr_handler *const *s_xattr;
+#ifdef CONFIG_FS_ENCRYPTION
+ const struct fscrypt_operations *s_cop;
+ struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */
+#endif
+#ifdef CONFIG_FS_VERITY
+ const struct fsverity_operations *s_vop;
+#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct unicode_map *s_encoding;
+ __u16 s_encoding_flags;
+#endif
+ struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
+ struct mount *s_mounts; /* list of mounts; _not_ for fs use */
+ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */
+ struct file *s_bdev_file;
+ struct backing_dev_info *s_bdi;
+ struct mtd_info *s_mtd;
+ struct hlist_node s_instances;
+ unsigned int s_quota_types; /* Bitmask of supported quota types */
+ struct quota_info s_dquot; /* Diskquota specific options */
+
+ struct sb_writers s_writers;
+
+ /*
+ * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
+ * s_fsnotify_info together for cache efficiency. They are frequently
+ * accessed and rarely modified.
+ */
+ void *s_fs_info; /* Filesystem private info */
+
+ /* Granularity of c/m/atime in ns (cannot be worse than a second) */
+ u32 s_time_gran;
+ /* Time limits for c/m/atime in seconds */
+ time64_t s_time_min;
+ time64_t s_time_max;
+#ifdef CONFIG_FSNOTIFY
+ u32 s_fsnotify_mask;
+ struct fsnotify_sb_info *s_fsnotify_info;
+#endif
+
+ /*
+ * q: why are s_id and s_sysfs_name not the same? both are human
+ * readable strings that identify the filesystem
+ * a: s_id is allowed to change at runtime; it's used in log messages,
+ * and we want to be able to change it when a device starts out as a
+ * single device (s_id is dev name) but then a device is hot added and
+ * we have to switch to identifying it by UUID
+ * but s_sysfs_name is a handle for programmatic access, and can't
+ * change at runtime
+ */
+ char s_id[32]; /* Informational name */
+ uuid_t s_uuid; /* UUID */
+ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */
+
+ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
+ char s_sysfs_name[UUID_STRING_LEN + 1];
+
+ unsigned int s_max_links;
+ unsigned int s_d_flags; /* default d_flags for dentries */
+
+ /*
+ * The next field is for VFS *only*. No filesystems have any business
+ * even looking at it. You have been warned.
+ */
+ struct mutex s_vfs_rename_mutex; /* Kludge */
+
+ /*
+ * Filesystem subtype. If non-empty the filesystem type field
+ * in /proc/mounts will be "type.subtype"
+ */
+ const char *s_subtype;
+
+ const struct dentry_operations *__s_d_op; /* default d_op for dentries */
+
+ struct shrinker *s_shrink; /* per-sb shrinker handle */
+
+ /* Number of inodes with nlink == 0 but still referenced */
+ atomic_long_t s_remove_count;
+
+ /* Read-only state of the superblock is being changed */
+ int s_readonly_remount;
+
+ /* per-sb errseq_t for reporting writeback errors via syncfs */
+ errseq_t s_wb_err;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
+ struct hlist_head s_pins;
+
+ /*
+ * Owning user namespace and default context in which to
+ * interpret filesystem uids, gids, quotas, device nodes,
+ * xattrs and security labels.
+ */
+ struct user_namespace *s_user_ns;
+
+ /*
+ * The list_lru structure is essentially just a pointer to a table
+ * of per-node lru lists, each of which has its own spinlock.
+ * There is no need to put them into separate cachelines.
+ */
+ struct list_lru s_dentry_lru;
+ struct list_lru s_inode_lru;
+ struct rcu_head rcu;
+ struct work_struct destroy_work;
+
+ struct mutex s_sync_lock; /* sync serialisation lock */
+
+ /*
+ * Indicates how deep in a filesystem stack this SB is
+ */
+ int s_stack_depth;
+
+ /* s_inode_list_lock protects s_inodes */
+ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
+ struct list_head s_inodes; /* all inodes */
+
+ spinlock_t s_inode_wblist_lock;
+ struct list_head s_inodes_wb; /* writeback inodes */
+ long s_min_writeback_pages;
+} __randomize_layout;
+
+/*
+ * sb->s_flags. Note that these mirror the equivalent MS_* flags where
+ * represented in both.
+ *
+ * The bit positions are not contiguous; the kernel-internal SB_* flags
+ * further down use bit positions distinct from the ones listed here.
+ */
+#define SB_RDONLY BIT(0) /* Mount read-only */
+#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */
+#define SB_NODEV BIT(2) /* Disallow access to device special files */
+#define SB_NOEXEC BIT(3) /* Disallow program execution */
+#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */
+#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */
+#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */
+#define SB_NOATIME BIT(10) /* Do not update access times. */
+#define SB_NODIRATIME BIT(11) /* Do not update directory access times */
+#define SB_SILENT BIT(15)
+#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */
+#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */
+#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */
+#define SB_I_VERSION BIT(23) /* Update inode I_version field */
+#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */
+
+/* These sb flags are internal to the kernel */
+#define SB_DEAD BIT(21)
+#define SB_DYING BIT(24)
+#define SB_FORCE BIT(27)
+#define SB_NOSEC BIT(28)
+#define SB_BORN BIT(29)
+#define SB_ACTIVE BIT(30)
+#define SB_NOUSER BIT(31)
+
+/* These flags relate to encoding and casefolding */
+#define SB_ENC_STRICT_MODE_FL (1 << 0)
+#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1)
+
+/*
+ * Test SB_ENC_STRICT_MODE_FL in sb->s_encoding_flags. The argument is
+ * parenthesized so that any pointer-valued expression can be passed.
+ *
+ * NOTE(review): s_encoding_flags only exists when CONFIG_UNICODE is enabled,
+ * yet this macro is defined unconditionally -- presumably all users are only
+ * built with CONFIG_UNICODE; confirm.
+ */
+#define sb_has_strict_encoding(sb) \
+ ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)
+
+#if IS_ENABLED(CONFIG_UNICODE)
+/* Test SB_ENC_NO_COMPAT_FALLBACK_FL in sb->s_encoding_flags. */
+#define sb_no_casefold_compat_fallback(sb) \
+ ((sb)->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
+#else
+/* Without CONFIG_UNICODE this is the constant 1. */
+#define sb_no_casefold_compat_fallback(sb) (1)
+#endif
+
+/* sb->s_iflags: the internal SB_I_* flags of struct super_block, one bit each */
+#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */
+#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */
+#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */
+#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */
+
+/* sb->s_iflags to limit user namespace mounts */
+#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */
+#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020
+#define SB_I_UNTRUSTED_MOUNTER 0x00000040
+#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080
+
+#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
+#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
+#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
+#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
+#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
+#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
+#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
+
+#endif /* _LINUX_FS_SUPER_TYPES_H */
diff --git a/include/linux/fs_types.h b/include/linux/fs_dirent.h
index 54816791196f..92f75c5bac19 100644
--- a/include/linux/fs_types.h
+++ b/include/linux/fs_dirent.h
@@ -1,6 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FS_TYPES_H
-#define _LINUX_FS_TYPES_H
+#ifndef _LINUX_FS_DIRENT_H
+#define _LINUX_FS_DIRENT_H
+
+#include <linux/stat.h>
+#include <linux/types.h>
/*
* This is a header for the common implementation of dirent
@@ -66,10 +69,10 @@
/*
* declarations for helper functions, accompanying implementation
- * is in fs/fs_types.c
+ * is in fs/fs_dirent.c
*/
extern unsigned char fs_ftype_to_dtype(unsigned int filetype);
extern unsigned char fs_umode_to_ftype(umode_t mode);
extern unsigned char fs_umode_to_dtype(umode_t mode);
-#endif
+#endif /* _LINUX_FS_DIRENT_H */
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index baf200ab5c77..0070764b790a 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H
+#include <linux/sched.h>
#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
@@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
extern bool current_chrooted(void);
+/* Return the umask stored in the current task's fs_struct. */
+static inline int current_umask(void)
+{
+ return current->fs->umask;
+}
+
#endif /* _LINUX_FS_STRUCT_H */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7ded7df6e9b5..07f8c309e432 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -193,6 +193,10 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs
#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \
defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS)
+#ifndef arch_ftrace_partial_regs
+#define arch_ftrace_partial_regs(regs) do {} while (0)
+#endif
+
static __always_inline struct pt_regs *
ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs)
{
@@ -202,7 +206,11 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs)
* Since arch_ftrace_get_regs() will check some members and may return
* NULL, we can not use it.
*/
- return &arch_ftrace_regs(fregs)->regs;
+ regs = &arch_ftrace_regs(fregs)->regs;
+
+ /* Allow arch specific updates to regs. */
+ arch_ftrace_partial_regs(regs);
+ return regs;
}
#endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0ceb4e09306c..623bee335383 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -7,6 +7,7 @@
#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
+#include <linux/cleanup.h>
#include <linux/sched.h>
struct vm_area_struct;
@@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
/* This should be paired with folio_put() rather than free_contig_range(). */
#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
+/*
+ * Cleanup helper named "free_page" for void * variables; its release
+ * expression is free_page((unsigned long)_T).
+ */
+DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
+
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 105cc4c00cc3..abc20f9810fd 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page)
kunmap_local(kaddr);
}
-#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
+#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES
-static inline void tag_clear_highpage(struct page *page)
+/* Return false to let people know we did not initialize the pages */
+static inline bool tag_clear_highpages(struct page *page, int numpages)
{
+ return false;
}
#endif
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f327d62fc985..71ac78b9f834 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
/*
- * try_folio_split - try to split a @folio at @page using non uniform split.
+ * try_folio_split_to_order - try to split a @folio at @page to @new_order using
+ * non uniform split.
* @folio: folio to be split
- * @page: split to order-0 at the given page
- * @list: store the after-split folios
+ * @page: split to @new_order at the given page
+ * @new_order: the target split order
*
- * Try to split a @folio at @page using non uniform split to order-0, if
- * non uniform split is not supported, fall back to uniform split.
+ * Try to split a @folio at @page using non uniform split to @new_order, if
+ * non uniform split is not supported, fall back to uniform split. After-split
+ * folios are put back to LRU list. Use min_order_for_split() to get the lower
+ * bound of @new_order.
*
* Return: 0: split is successful, otherwise split failed.
*/
-static inline int try_folio_split(struct folio *folio, struct page *page,
- struct list_head *list)
+static inline int try_folio_split_to_order(struct folio *folio,
+ struct page *page, unsigned int new_order)
{
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- if (!non_uniform_split_supported(folio, 0, false))
- return split_huge_page_to_list_to_order(&folio->page, list,
- ret);
- return folio_split(folio, ret, page, list);
+ if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
+ return split_huge_page_to_list_to_order(&folio->page, NULL,
+ new_order);
+ return folio_split(folio, new_order, page, NULL);
}
static inline int split_huge_page(struct page *page)
{
- struct folio *folio = page_folio(page);
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- /*
- * split_huge_page() locks the page before splitting and
- * expects the same page that has been split to be locked when
- * returned. split_folio(page_folio(page)) cannot be used here
- * because it converts the page to folio and passes the head
- * page to be split.
- */
- return split_huge_page_to_list_to_order(page, NULL, ret);
+ return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
@@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page)
return -EINVAL;
}
+/*
+ * Always warns (VM_WARN_ON_ONCE_FOLIO) and returns -EINVAL, like the
+ * neighbouring split helpers in this part of the header.
+ */
+static inline int min_order_for_split(struct folio *folio)
+{
+ VM_WARN_ON_ONCE_FOLIO(1, folio);
+ return -EINVAL;
+}
+
static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
}
-static inline int try_folio_split(struct folio *folio, struct page *page,
- struct list_head *list)
+static inline int try_folio_split_to_order(struct folio *folio,
+ struct page *page, unsigned int new_order)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 5eb66a399002..4f33e6a39797 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer,
size_t size, bool cyclic);
void iio_dma_buffer_lock_queue(struct iio_buffer *buffer);
void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer);
+struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer);
#endif
diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h
index e72552e026f3..8d770ced66b2 100644
--- a/include/linux/iio/buffer_impl.h
+++ b/include/linux/iio/buffer_impl.h
@@ -50,6 +50,7 @@ struct sg_table;
* @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF
* object to this buffer. Requires a valid DMABUF fd, that
* was previouly attached to this buffer.
+ * @get_dma_dev: called to get the struct device of the DMA channel
+ * associated with this buffer.
* @lock_queue: called when the core needs to lock the buffer queue;
* it is used when enqueueing DMABUF objects.
* @unlock_queue: used to unlock a previously locked buffer queue
@@ -90,6 +91,7 @@ struct iio_buffer_access_funcs {
struct iio_dma_buffer_block *block,
struct dma_fence *fence, struct sg_table *sgt,
size_t size, bool cyclic);
+ struct device * (*get_dma_dev)(struct iio_buffer *buffer);
void (*lock_queue)(struct iio_buffer *buffer);
void (*unlock_queue)(struct iio_buffer *buffer);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index bccb3f1f6262..a6cb241ea00c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,7 +25,6 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
extern struct nsproxy init_nsproxy;
-extern struct cred init_cred;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 73dceabc21c8..520e967cb501 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
struct address_space;
struct fiemap_extent_info;
@@ -16,6 +17,7 @@ struct inode;
struct iomap_iter;
struct iomap_dio;
struct iomap_writepage_ctx;
+struct iomap_read_folio_ctx;
struct iov_iter;
struct kiocb;
struct page;
@@ -241,11 +243,12 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
+ struct folio_batch *fbatch;
void *private;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
-int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
+int iomap_iter_advance(struct iomap_iter *iter, u64 count);
/**
* iomap_length_trim - trimmed length of the current iomap iteration
@@ -282,9 +285,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter)
*/
static inline int iomap_iter_advance_full(struct iomap_iter *iter)
{
- u64 length = iomap_length(iter);
-
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, iomap_length(iter));
}
/**
@@ -339,8 +340,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
-int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
-void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+void iomap_read_folio(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
+void iomap_readahead(const struct iomap_ops *ops,
+ struct iomap_read_folio_ctx *ctx);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
@@ -349,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops);
+loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
+ loff_t length);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
@@ -430,6 +435,10 @@ struct iomap_writeback_ops {
* An existing mapping from a previous call to this method can be reused
* by the file system if it is still valid.
*
+ * If this succeeds, iomap_finish_folio_write() must be called once
+ * writeback completes for the range, regardless of whether the
+ * writeback succeeded or failed.
+ *
* Returns the number of bytes processed or a negative errno.
*/
ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
@@ -467,14 +476,41 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
loff_t pos, loff_t end_pos, unsigned int dirty_len);
int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len);
+void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
+ int error);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len);
int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio);
int iomap_writepages(struct iomap_writepage_ctx *wpc);
+/*
+ * Context handed to iomap_read_folio() and iomap_readahead().
+ * iomap_bio_read_folio() fills in @cur_folio and iomap_bio_readahead() fills
+ * in @rac; both set @ops and leave the remaining members zeroed.
+ */
+struct iomap_read_folio_ctx {
+ const struct iomap_read_ops *ops;
+ struct folio *cur_folio;
+ struct readahead_control *rac;
+ void *read_ctx;
+};
+
+/* Read callbacks referenced from iomap_read_folio_ctx::ops. */
+struct iomap_read_ops {
+ /*
+ * Read in a folio range.
+ *
+ * If this succeeds, iomap_finish_folio_read() must be called after the
+ * range is read in, regardless of whether the read succeeded or failed.
+ *
+ * Returns 0 on success or a negative error on failure.
+ */
+ int (*read_folio_range)(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t len);
+
+ /*
+ * Submit any pending read requests.
+ *
+ * This is optional.
+ */
+ void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+};
+
/*
* Flags for direct I/O ->end_io:
*/
@@ -518,6 +554,14 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
+/*
+ * Ensure each bio is aligned to fs block size.
+ *
+ * For filesystems which need to calculate/verify the checksum of each fs
+ * block. Otherwise they may not be able to handle unaligned bios.
+ */
+#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3)
+
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
@@ -540,4 +584,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
extern struct bio_set iomap_ioend_bioset;
+#ifdef CONFIG_BLOCK
+extern const struct iomap_read_ops iomap_bio_read_ops;
+
+/*
+ * Read @folio via iomap_read_folio() with iomap_bio_read_ops as the read
+ * callbacks; the context lives on the stack for the duration of the call.
+ */
+static inline void iomap_bio_read_folio(struct folio *folio,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .cur_folio = folio,
+ };
+
+ iomap_read_folio(ops, &ctx);
+}
+
+/*
+ * Run readahead for @rac via iomap_readahead() with iomap_bio_read_ops as
+ * the read callbacks; the context lives on the stack for the duration of
+ * the call.
+ */
+static inline void iomap_bio_readahead(struct readahead_control *rac,
+ const struct iomap_ops *ops)
+{
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &iomap_bio_read_ops,
+ .rac = rac,
+ };
+
+ iomap_readahead(ops, &ctx);
+}
+#endif /* CONFIG_BLOCK */
+
#endif /* LINUX_IOMAP_H */
diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 0d91d060e3e9..b0e6ab329b00 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -6,6 +6,7 @@
/**
* local_lock_init - Runtime initialize a lock instance
+ * @lock: The lock variable
*/
#define local_lock_init(lock) __local_lock_init(lock)
@@ -52,7 +53,8 @@
__local_unlock_irqrestore(this_cpu_ptr(lock), flags)
/**
- * local_lock_init - Runtime initialize a lock instance
+ * local_trylock_init - Runtime initialize a lock instance
+ * @lock: The lock variable
*/
#define local_trylock_init(lock) __local_trylock_init(lock)
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index a4dc479157b5..8f82b4eb542f 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -99,18 +99,18 @@ do { \
#define __local_lock_acquire(lock) \
do { \
- local_trylock_t *tl; \
- local_lock_t *l; \
+ local_trylock_t *__tl; \
+ local_lock_t *__l; \
\
- l = (local_lock_t *)(lock); \
- tl = (local_trylock_t *)l; \
+ __l = (local_lock_t *)(lock); \
+ __tl = (local_trylock_t *)__l; \
_Generic((lock), \
local_trylock_t *: ({ \
- lockdep_assert(tl->acquired == 0); \
- WRITE_ONCE(tl->acquired, 1); \
+ lockdep_assert(__tl->acquired == 0); \
+ WRITE_ONCE(__tl->acquired, 1); \
}), \
local_lock_t *: (void)0); \
- local_lock_acquire(l); \
+ local_lock_acquire(__l); \
} while (0)
#define __local_lock(lock) \
@@ -133,36 +133,36 @@ do { \
#define __local_trylock(lock) \
({ \
- local_trylock_t *tl; \
+ local_trylock_t *__tl; \
\
preempt_disable(); \
- tl = (lock); \
- if (READ_ONCE(tl->acquired)) { \
+ __tl = (lock); \
+ if (READ_ONCE(__tl->acquired)) { \
preempt_enable(); \
- tl = NULL; \
+ __tl = NULL; \
} else { \
- WRITE_ONCE(tl->acquired, 1); \
+ WRITE_ONCE(__tl->acquired, 1); \
local_trylock_acquire( \
- (local_lock_t *)tl); \
+ (local_lock_t *)__tl); \
} \
- !!tl; \
+ !!__tl; \
})
#define __local_trylock_irqsave(lock, flags) \
({ \
- local_trylock_t *tl; \
+ local_trylock_t *__tl; \
\
local_irq_save(flags); \
- tl = (lock); \
- if (READ_ONCE(tl->acquired)) { \
+ __tl = (lock); \
+ if (READ_ONCE(__tl->acquired)) { \
local_irq_restore(flags); \
- tl = NULL; \
+ __tl = NULL; \
} else { \
- WRITE_ONCE(tl->acquired, 1); \
+ WRITE_ONCE(__tl->acquired, 1); \
local_trylock_acquire( \
- (local_lock_t *)tl); \
+ (local_lock_t *)__tl); \
} \
- !!tl; \
+ !!__tl; \
})
/* preemption or migration must be disabled before calling __local_lock_is_locked */
@@ -170,16 +170,16 @@ do { \
#define __local_lock_release(lock) \
do { \
- local_trylock_t *tl; \
- local_lock_t *l; \
+ local_trylock_t *__tl; \
+ local_lock_t *__l; \
\
- l = (local_lock_t *)(lock); \
- tl = (local_trylock_t *)l; \
- local_lock_release(l); \
+ __l = (local_lock_t *)(lock); \
+ __tl = (local_trylock_t *)__l; \
+ local_lock_release(__l); \
_Generic((lock), \
local_trylock_t *: ({ \
- lockdep_assert(tl->acquired == 1); \
- WRITE_ONCE(tl->acquired, 0); \
+ lockdep_assert(__tl->acquired == 1); \
+ WRITE_ONCE(__tl->acquired, 0); \
}), \
local_lock_t *: (void)0); \
} while (0)
@@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t;
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
-#define __local_lock_init(l) \
+#define __local_lock_init(__l) \
do { \
- local_spin_lock_init((l)); \
+ local_spin_lock_init((__l)); \
} while (0)
-#define __local_trylock_init(l) __local_lock_init(l)
+#define __local_trylock_init(__l) __local_lock_init(__l)
#define __local_lock(__lock) \
do { \
diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h
index 4c1a91b07de3..e1555e06e7e5 100644
--- a/include/linux/mailbox/mtk-cmdq-mailbox.h
+++ b/include/linux/mailbox/mtk-cmdq-mailbox.h
@@ -77,6 +77,16 @@ struct cmdq_pkt {
size_t buf_size; /* real buffer size */
};
+/**
+ * cmdq_get_shift_pa() - get the shift bits of physical address
+ * @chan: mailbox channel
+ *
+ * GCE can only fetch the command buffer address from a 32-bit register.
+ * Some SOCs support more than 32-bit command buffer address for GCE, which
+ * requires some shift bits to make the address fit into the 32-bit register.
+ *
+ * Return: the shift bits of physical address
+ */
u8 cmdq_get_shift_pa(struct mbox_chan *chan);
#endif /* __MTK_CMDQ_MAILBOX_H__ */
diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h
index 62674c83bde4..48e2ff95332f 100644
--- a/include/linux/map_benchmark.h
+++ b/include/linux/map_benchmark.h
@@ -27,5 +27,6 @@ struct map_benchmark {
__u32 dma_dir; /* DMA data direction */
__u32 dma_trans_ns; /* time for DMA transmission in ns */
__u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
+ __u8 expansion[76]; /* For future use */
};
#endif /* _KERNEL_DMA_BENCHMARK_H */
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 7ef2c7c7d803..9d47cdc727ad 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -183,6 +183,7 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq)
complete(&cq->free);
}
+void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe);
int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
u32 *in, int inlen, u32 *out, int outlen);
int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04fa27718cd1..170594b5cb6b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2074,7 +2074,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
return folio_large_nr_pages(folio);
}
-#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE)
+#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
/*
* We don't expect any folios that exceed buddy sizes (and consequently
* memory sections).
@@ -2087,10 +2087,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
* pages are guaranteed to be contiguous.
*/
#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
-#else
+#elif defined(CONFIG_HUGETLB_PAGE)
/*
* There is no real limit on the folio size. We limit them to the maximum we
- * currently expect (e.g., hugetlb, dax).
+ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
+ * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
+ */
+#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#else
+/*
+ * Without hugetlb, gigantic folios that are bigger than a single PUD are
+ * currently impossible.
*/
#define MAX_FOLIO_ORDER PUD_ORDER
#endif
@@ -3497,10 +3504,10 @@ struct vm_unmapped_area_info {
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
-extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_inode_pages_range(struct address_space *,
- loff_t lstart, loff_t lend);
-extern void truncate_inode_pages_final(struct address_space *);
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+ uoff_t lend);
+void truncate_inode_pages_final(struct address_space *mapping);
/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 847b81ca6436..bf535f0118bb 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -86,8 +86,23 @@ do { \
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
-extern void __mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_init_lockep(lock, name, key);
+}
+#else
+extern void mutex_init_generic(struct mutex *lock);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_init_generic(lock);
+}
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
/**
* mutex_is_locked - is the mutex locked
@@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock);
#define DEFINE_MUTEX(mutexname) \
struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
-extern void __mutex_rt_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
-
#define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex)
-#define __mutex_init(mutex, name, key) \
-do { \
- rt_mutex_base_init(&(mutex)->rtmutex); \
- __mutex_rt_init((mutex), name, key); \
-} while (0)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name,
+ struct lock_class_key *key);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_rt_init_lockdep(lock, name, key);
+}
+#else
+extern void mutex_rt_init_generic(struct mutex *mutex);
+
+static inline void __mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
+{
+ mutex_rt_init_generic(lock);
+}
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#endif /* CONFIG_PREEMPT_RT */
#ifdef CONFIG_DEBUG_MUTEXES
diff --git a/include/linux/namei.h b/include/linux/namei.h
index fed86221c69c..58600cf234bc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -7,6 +7,7 @@
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>
+#include <linux/fs_struct.h>
enum { MAX_NESTED_LINKS = 8 };
@@ -88,6 +89,81 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap,
struct qstr *name,
struct dentry *base);
+struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_creating_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_removing_killable(struct mnt_idmap *idmap,
+ struct dentry *parent,
+ struct qstr *name);
+struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name);
+struct dentry *start_creating_dentry(struct dentry *parent,
+ struct dentry *child);
+struct dentry *start_removing_dentry(struct dentry *parent,
+ struct dentry *child);
+
+/* end_creating - finish action started with start_creating
+ * @child: dentry returned by start_creating() or vfs_mkdir()
+ *
+ * Unlock and release the child. This can be called after
+ * start_creating() whether that function succeeded or not,
+ * but it is not needed on failure.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.
+ *
+ *
+ * If vfs_mkdir() was not called, then @child is the dentry that
+ * start_creating() returned.
+ */
+static inline void end_creating(struct dentry *child)
+{
+ end_dirop(child);
+}
+
+/* end_creating_keep - finish action started with start_creating() and return result
+ * @child: dentry returned by start_creating() or vfs_mkdir()
+ *
+ * Unlock and return the child. This can be called after
+ * start_creating() whether that function succeeded or not,
+ * but it is not needed on failure.
+ *
+ * If vfs_mkdir() was called then the value returned from that function
+ * should be given for @child rather than the original dentry, as vfs_mkdir()
+ * may have provided a new dentry.
+ *
+ * Returns: @child, which may be a dentry or an error.
+ *
+ */
+static inline struct dentry *end_creating_keep(struct dentry *child)
+{
+ if (!IS_ERR(child))
+ dget(child);
+ end_dirop(child);
+ return child;
+}
+
+/**
+ * end_removing - finish action started with start_removing
+ * @child: dentry returned by start_removing()
+ *
+ * Unlock and release the child.
+ *
+ * This is identical to end_dirop().
+ *
+ * It can be passed the result of start_removing() whether that was
+ * successful or not, but it is not needed if start_removing() failed.
+ */
+static inline void end_removing(struct dentry *child)
+{
+ end_dirop(child);
+}
+
extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);
@@ -95,6 +171,13 @@ extern int follow_up(struct path *);
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);
+int start_renaming(struct renamedata *rd, int lookup_flags,
+ struct qstr *old_last, struct qstr *new_last);
+int start_renaming_dentry(struct renamedata *rd, int lookup_flags,
+ struct dentry *old_dentry, struct qstr *new_last);
+int start_renaming_two_dentries(struct renamedata *rd,
+ struct dentry *old_dentry, struct dentry *new_dentry);
+void end_renaming(struct renamedata *rd);
/**
* mode_strip_umask - handle vfs umask stripping
diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h
new file mode 100644
index 000000000000..b332b019b29c
--- /dev/null
+++ b/include/linux/ns/ns_common_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NS_COMMON_TYPES_H
+#define _LINUX_NS_COMMON_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/ns/nstree_types.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/types.h>
+
+struct cgroup_namespace;
+struct dentry;
+struct ipc_namespace;
+struct mnt_namespace;
+struct net;
+struct pid_namespace;
+struct proc_ns_operations;
+struct time_namespace;
+struct user_namespace;
+struct uts_namespace;
+
+extern struct cgroup_namespace init_cgroup_ns;
+extern struct ipc_namespace init_ipc_ns;
+extern struct mnt_namespace init_mnt_ns;
+extern struct net init_net;
+extern struct pid_namespace init_pid_ns;
+extern struct time_namespace init_time_ns;
+extern struct user_namespace init_user_ns;
+extern struct uts_namespace init_uts_ns;
+
+extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations ipcns_operations;
+extern const struct proc_ns_operations mntns_operations;
+extern const struct proc_ns_operations netns_operations;
+extern const struct proc_ns_operations pidns_operations;
+extern const struct proc_ns_operations pidns_for_children_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
+extern const struct proc_ns_operations userns_operations;
+extern const struct proc_ns_operations utsns_operations;
+
+/*
+ * Namespace lifetimes are managed via a two-tier reference counting model:
+ *
+ * (1) __ns_ref (refcount_t): Main reference count tracking memory
+ * lifetime. Controls when the namespace structure itself is freed.
+ * It also pins the namespace on the namespace trees whereas (2)
+ * only regulates their visibility to userspace.
+ *
+ * (2) __ns_ref_active (atomic_t): Reference count tracking active users.
+ * Controls visibility of the namespace in the namespace trees.
+ * Any live task that uses the namespace (via nsproxy or cred) holds
+ * an active reference. Any open file descriptor or bind-mount of
+ * the namespace holds an active reference. Once all tasks have
+ * exited their namespaces and all file descriptors and
+ * bind-mounts have been released the active reference count drops
+ * to zero and the namespace becomes inactive. IOW, the namespace
+ * cannot be listed or opened via file handles anymore.
+ *
+ * Note that it is valid to transition from active to inactive and
+ * back from inactive to active e.g., when resurrecting an inactive
+ * namespace tree via the SIOCGSKNS ioctl().
+ *
+ * Relationship and lifecycle states:
+ *
+ * - Active (__ns_ref_active > 0):
+ * Namespace is actively used and visible to userspace. The namespace
+ * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file
+ * handles, or discovered via listns().
+ *
+ * - Inactive (__ns_ref_active == 0, __ns_ref > 0):
+ * No tasks are actively using the namespace and it isn't pinned by
+ * any bind-mounts or open file descriptors anymore. But the namespace
+ * is still kept alive by internal references. For example, the user
+ * namespace could be pinned by an open file through file->f_cred
+ * references when one of the now defunct tasks had opened a file and
+ * handed the file descriptor off to another process via a UNIX
+ * socket. Such references keep the namespace structure alive through
+ * __ns_ref but will not hold an active reference.
+ *
+ * - Destroyed (__ns_ref == 0):
+ * No references remain. The namespace is removed from the tree and freed.
+ *
+ * State transitions:
+ *
+ * Active -> Inactive:
+ * When the last task using the namespace exits it drops its active
+ * references to all namespaces. However, user and pid namespaces
+ * remain accessible until the task has been reaped.
+ *
+ * Inactive -> Active:
+ * An inactive namespace tree might be resurrected due to e.g., the
+ * SIOCGSKNS ioctl() on a socket.
+ *
+ * Inactive -> Destroyed:
+ * When __ns_ref drops to zero the namespace is removed from the
+ * namespace trees and the memory is freed (after an RCU grace period).
+ *
+ * Initial namespaces:
+ * Boot-time namespaces (init_net, init_pid_ns, etc.) start with
+ * __ns_ref_active = 1 and remain active forever.
+ *
+ * @ns_type: type of namespace (e.g., CLONE_NEWNET)
+ * @stashed: cached dentry to be used by the vfs
+ * @ops: namespace operations
+ * @inum: namespace inode number (quickly recycled for non-initial namespaces)
+ * @__ns_ref: main reference count (do not use directly)
+ * @ns_tree: namespace tree nodes and active reference count
+ */
+struct ns_common {
+ u32 ns_type;
+ struct dentry *stashed;
+ const struct proc_ns_operations *ops;
+ unsigned int inum;
+ refcount_t __ns_ref; /* do not use directly */
+ union {
+ struct ns_tree;
+ struct rcu_head ns_rcu;
+ };
+};
+
+#define to_ns_common(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &(__ns)->ns, \
+ const struct cgroup_namespace *: &(__ns)->ns, \
+ struct ipc_namespace *: &(__ns)->ns, \
+ const struct ipc_namespace *: &(__ns)->ns, \
+ struct mnt_namespace *: &(__ns)->ns, \
+ const struct mnt_namespace *: &(__ns)->ns, \
+ struct net *: &(__ns)->ns, \
+ const struct net *: &(__ns)->ns, \
+ struct pid_namespace *: &(__ns)->ns, \
+ const struct pid_namespace *: &(__ns)->ns, \
+ struct time_namespace *: &(__ns)->ns, \
+ const struct time_namespace *: &(__ns)->ns, \
+ struct user_namespace *: &(__ns)->ns, \
+ const struct user_namespace *: &(__ns)->ns, \
+ struct uts_namespace *: &(__ns)->ns, \
+ const struct uts_namespace *: &(__ns)->ns)
+
+#define ns_init_inum(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
+ struct ipc_namespace *: IPC_NS_INIT_INO, \
+ struct mnt_namespace *: MNT_NS_INIT_INO, \
+ struct net *: NET_NS_INIT_INO, \
+ struct pid_namespace *: PID_NS_INIT_INO, \
+ struct time_namespace *: TIME_NS_INIT_INO, \
+ struct user_namespace *: USER_NS_INIT_INO, \
+ struct uts_namespace *: UTS_NS_INIT_INO)
+
+#define ns_init_ns(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: &init_cgroup_ns, \
+ struct ipc_namespace *: &init_ipc_ns, \
+ struct mnt_namespace *: &init_mnt_ns, \
+ struct net *: &init_net, \
+ struct pid_namespace *: &init_pid_ns, \
+ struct time_namespace *: &init_time_ns, \
+ struct user_namespace *: &init_user_ns, \
+ struct uts_namespace *: &init_uts_ns)
+
+#define ns_init_id(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CGROUP_NS_INIT_ID, \
+ struct ipc_namespace *: IPC_NS_INIT_ID, \
+ struct mnt_namespace *: MNT_NS_INIT_ID, \
+ struct net *: NET_NS_INIT_ID, \
+ struct pid_namespace *: PID_NS_INIT_ID, \
+ struct time_namespace *: TIME_NS_INIT_ID, \
+ struct user_namespace *: USER_NS_INIT_ID, \
+ struct uts_namespace *: UTS_NS_INIT_ID)
+
+#define to_ns_operations(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
+ struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
+ struct mnt_namespace *: &mntns_operations, \
+ struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
+ struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
+ struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
+ struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
+ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
+
+#define ns_common_type(__ns) \
+ _Generic((__ns), \
+ struct cgroup_namespace *: CLONE_NEWCGROUP, \
+ struct ipc_namespace *: CLONE_NEWIPC, \
+ struct mnt_namespace *: CLONE_NEWNS, \
+ struct net *: CLONE_NEWNET, \
+ struct pid_namespace *: CLONE_NEWPID, \
+ struct time_namespace *: CLONE_NEWTIME, \
+ struct user_namespace *: CLONE_NEWUSER, \
+ struct uts_namespace *: CLONE_NEWUTS)
+
+#endif /* _LINUX_NS_COMMON_TYPES_H */
diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h
new file mode 100644
index 000000000000..2fb28ee31efb
--- /dev/null
+++ b/include/linux/ns/nstree_types.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
+#ifndef _LINUX_NSTREE_TYPES_H
+#define _LINUX_NSTREE_TYPES_H
+
+#include <linux/rbtree.h>
+#include <linux/list.h>
+
+/**
+ * struct ns_tree_root - Root of a namespace tree
+ * @ns_rb: Red-black tree root for efficient lookups
+ * @ns_list_head: List head for sequential iteration
+ *
+ * Each namespace tree maintains both an rbtree (for O(log n) lookups)
+ * and a list (for efficient sequential iteration). The list is kept in
+ * the same sorted order as the rbtree.
+ */
+struct ns_tree_root {
+ struct rb_root ns_rb;
+ struct list_head ns_list_head;
+};
+
+/**
+ * struct ns_tree_node - Node in a namespace tree
+ * @ns_node: Red-black tree node
+ * @ns_list_entry: List entry for sequential iteration
+ *
+ * Represents a namespace's position in a tree. Each namespace has
+ * multiple tree nodes for different trees (unified, per-type, owner).
+ */
+struct ns_tree_node {
+ struct rb_node ns_node;
+ struct list_head ns_list_entry;
+};
+
+/**
+ * struct ns_tree - Namespace tree nodes and active reference count
+ * @ns_id: Unique namespace identifier
+ * @__ns_ref_active: Active reference count (do not use directly)
+ * @ns_unified_node: Node in the global namespace tree
+ * @ns_tree_node: Node in the per-type namespace tree
+ * @ns_owner_node: Node in the owner namespace's tree of owned namespaces
+ * @ns_owner_root: Root of the tree of namespaces owned by this namespace
+ * (only used when this namespace is an owner)
+ */
+struct ns_tree {
+ u64 ns_id;
+ atomic_t __ns_ref_active;
+ struct ns_tree_node ns_unified_node;
+ struct ns_tree_node ns_tree_node;
+ struct ns_tree_node ns_owner_node;
+ struct ns_tree_root ns_owner_root;
+};
+
+#endif /* _LINUX_NSTREE_TYPES_H */
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index f5b68b8abb54..825f5865bfc5 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -2,122 +2,44 @@
#ifndef _LINUX_NS_COMMON_H
#define _LINUX_NS_COMMON_H
+#include <linux/ns/ns_common_types.h>
#include <linux/refcount.h>
-#include <linux/rbtree.h>
+#include <linux/vfsdebug.h>
#include <uapi/linux/sched.h>
+#include <uapi/linux/nsfs.h>
-struct proc_ns_operations;
-
-struct cgroup_namespace;
-struct ipc_namespace;
-struct mnt_namespace;
-struct net;
-struct pid_namespace;
-struct time_namespace;
-struct user_namespace;
-struct uts_namespace;
-
-extern struct cgroup_namespace init_cgroup_ns;
-extern struct ipc_namespace init_ipc_ns;
-extern struct mnt_namespace init_mnt_ns;
-extern struct net init_net;
-extern struct pid_namespace init_pid_ns;
-extern struct time_namespace init_time_ns;
-extern struct user_namespace init_user_ns;
-extern struct uts_namespace init_uts_ns;
-
-extern const struct proc_ns_operations netns_operations;
-extern const struct proc_ns_operations utsns_operations;
-extern const struct proc_ns_operations ipcns_operations;
-extern const struct proc_ns_operations pidns_operations;
-extern const struct proc_ns_operations pidns_for_children_operations;
-extern const struct proc_ns_operations userns_operations;
-extern const struct proc_ns_operations mntns_operations;
-extern const struct proc_ns_operations cgroupns_operations;
-extern const struct proc_ns_operations timens_operations;
-extern const struct proc_ns_operations timens_for_children_operations;
-
-struct ns_common {
- u32 ns_type;
- struct dentry *stashed;
- const struct proc_ns_operations *ops;
- unsigned int inum;
- refcount_t __ns_ref; /* do not use directly */
- union {
- struct {
- u64 ns_id;
- struct rb_node ns_tree_node;
- struct list_head ns_list_node;
- };
- struct rcu_head ns_rcu;
- };
-};
-
+bool is_current_namespace(struct ns_common *ns);
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum);
void __ns_common_free(struct ns_common *ns);
+struct ns_common *__must_check ns_owner(struct ns_common *ns);
+
+static __always_inline bool is_ns_init_inum(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->inum == 0);
+ return unlikely(in_range(ns->inum, MNT_NS_INIT_INO,
+ IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
+}
+
+static __always_inline bool is_ns_init_id(const struct ns_common *ns)
+{
+ VFS_WARN_ON_ONCE(ns->ns_id == 0);
+ return ns->ns_id <= NS_LAST_INIT_ID;
+}
-#define to_ns_common(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &(__ns)->ns, \
- const struct cgroup_namespace *: &(__ns)->ns, \
- struct ipc_namespace *: &(__ns)->ns, \
- const struct ipc_namespace *: &(__ns)->ns, \
- struct mnt_namespace *: &(__ns)->ns, \
- const struct mnt_namespace *: &(__ns)->ns, \
- struct net *: &(__ns)->ns, \
- const struct net *: &(__ns)->ns, \
- struct pid_namespace *: &(__ns)->ns, \
- const struct pid_namespace *: &(__ns)->ns, \
- struct time_namespace *: &(__ns)->ns, \
- const struct time_namespace *: &(__ns)->ns, \
- struct user_namespace *: &(__ns)->ns, \
- const struct user_namespace *: &(__ns)->ns, \
- struct uts_namespace *: &(__ns)->ns, \
- const struct uts_namespace *: &(__ns)->ns)
-
-#define ns_init_inum(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CGROUP_NS_INIT_INO, \
- struct ipc_namespace *: IPC_NS_INIT_INO, \
- struct mnt_namespace *: MNT_NS_INIT_INO, \
- struct net *: NET_NS_INIT_INO, \
- struct pid_namespace *: PID_NS_INIT_INO, \
- struct time_namespace *: TIME_NS_INIT_INO, \
- struct user_namespace *: USER_NS_INIT_INO, \
- struct uts_namespace *: UTS_NS_INIT_INO)
-
-#define ns_init_ns(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: &init_cgroup_ns, \
- struct ipc_namespace *: &init_ipc_ns, \
- struct mnt_namespace *: &init_mnt_ns, \
- struct net *: &init_net, \
- struct pid_namespace *: &init_pid_ns, \
- struct time_namespace *: &init_time_ns, \
- struct user_namespace *: &init_user_ns, \
- struct uts_namespace *: &init_uts_ns)
-
-#define to_ns_operations(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \
- struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \
- struct mnt_namespace *: &mntns_operations, \
- struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \
- struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \
- struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \
- struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \
- struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL))
-
-#define ns_common_type(__ns) \
- _Generic((__ns), \
- struct cgroup_namespace *: CLONE_NEWCGROUP, \
- struct ipc_namespace *: CLONE_NEWIPC, \
- struct mnt_namespace *: CLONE_NEWNS, \
- struct net *: CLONE_NEWNET, \
- struct pid_namespace *: CLONE_NEWPID, \
- struct time_namespace *: CLONE_NEWTIME, \
- struct user_namespace *: CLONE_NEWUSER, \
- struct uts_namespace *: CLONE_NEWUTS)
+#define NS_COMMON_INIT(nsname) \
+{ \
+ .ns_type = ns_common_type(&nsname), \
+ .ns_id = ns_init_id(&nsname), \
+ .inum = ns_init_inum(&nsname), \
+ .ops = to_ns_operations(&nsname), \
+ .stashed = NULL, \
+ .__ns_ref = REFCOUNT_INIT(1), \
+ .__ns_ref_active = ATOMIC_INIT(1), \
+ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \
+ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \
+ .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \
+ .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \
+}
#define ns_common_init(__ns) \
__ns_common_init(to_ns_common(__ns), \
@@ -133,21 +55,96 @@ void __ns_common_free(struct ns_common *ns);
#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns)))
+static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns)
+{
+ return atomic_read(&ns->__ns_ref_active);
+}
+
+static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns)
+{
+ return refcount_read(&ns->__ns_ref);
+}
+
static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns)
{
- return refcount_dec_and_test(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ if (refcount_dec_and_test(&ns->__ns_ref)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return true;
+ }
+ return false;
}
static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns)
{
- return refcount_inc_not_zero(&ns->__ns_ref);
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return true;
+ }
+ if (refcount_inc_not_zero(&ns->__ns_ref))
+ return true;
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns));
+ return false;
}
-#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref)
-#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns)))
-#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns)))
-#define ns_ref_put_and_lock(__ns, __lock) \
- refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock))
+static __always_inline void __ns_ref_inc(struct ns_common *ns)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return;
+ }
+ refcount_inc(&ns->__ns_ref);
+}
+
+static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns,
+ spinlock_t *ns_lock)
+{
+ if (is_ns_init_id(ns)) {
+ VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1);
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1);
+ return false;
+ }
+ return refcount_dec_and_lock(&ns->__ns_ref, ns_lock);
+}
+
+#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns)))
+#define ns_ref_inc(__ns) \
+ do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0)
+#define ns_ref_get(__ns) \
+ ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false)
+#define ns_ref_put(__ns) \
+ ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false)
+#define ns_ref_put_and_lock(__ns, __ns_lock) \
+ ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false)
+
+#define ns_ref_active_read(__ns) \
+ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)
+
+void __ns_ref_active_put(struct ns_common *ns);
+
+#define ns_ref_active_put(__ns) \
+ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)
+
+static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
+{
+ if (!__ns_ref_active_read(ns)) {
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ return NULL;
+ }
+ if (!__ns_ref_get(ns))
+ return NULL;
+ return ns;
+}
+
+void __ns_ref_active_get(struct ns_common *ns);
+
+#define ns_ref_active_get(__ns) \
+ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)
#endif
diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h
index e5a5fa83d36b..731b67fc2fec 100644
--- a/include/linux/nsfs.h
+++ b/include/linux/nsfs.h
@@ -37,4 +37,7 @@ void nsfs_init(void);
#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns)
+void nsproxy_ns_active_get(struct nsproxy *ns);
+void nsproxy_ns_active_put(struct nsproxy *ns);
+
#endif /* _LINUX_NSFS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index bd118a187dec..5a67648721c7 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -93,10 +93,13 @@ static inline struct cred *nsset_cred(struct nsset *set)
*/
int copy_namespaces(u64 flags, struct task_struct *tsk);
-void exit_task_namespaces(struct task_struct *tsk);
+void switch_cred_namespaces(const struct cred *old, const struct cred *new);
+void exit_nsproxy_namespaces(struct task_struct *tsk);
+void get_cred_namespaces(struct task_struct *tsk);
+void exit_cred_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
int exec_task_namespaces(void);
-void free_nsproxy(struct nsproxy *ns);
+void deactivate_nsproxy(struct nsproxy *ns);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct cred *, struct fs_struct *);
int __init nsproxy_cache_init(void);
@@ -104,7 +107,7 @@ int __init nsproxy_cache_init(void);
static inline void put_nsproxy(struct nsproxy *ns)
{
if (refcount_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ deactivate_nsproxy(ns);
}
static inline void get_nsproxy(struct nsproxy *ns)
diff --git a/include/linux/nstree.h b/include/linux/nstree.h
index 8b8636690473..175e4625bfa6 100644
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@@ -1,22 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#ifndef _LINUX_NSTREE_H
#define _LINUX_NSTREE_H
-#include <linux/ns_common.h>
+#include <linux/ns/nstree_types.h>
#include <linux/nsproxy.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/rculist.h>
#include <linux/cookie.h>
+#include <uapi/linux/nsfs.h>
-extern struct ns_tree cgroup_ns_tree;
-extern struct ns_tree ipc_ns_tree;
-extern struct ns_tree mnt_ns_tree;
-extern struct ns_tree net_ns_tree;
-extern struct ns_tree pid_ns_tree;
-extern struct ns_tree time_ns_tree;
-extern struct ns_tree user_ns_tree;
-extern struct ns_tree uts_ns_tree;
+struct ns_common;
+
+extern struct ns_tree_root cgroup_ns_tree;
+extern struct ns_tree_root ipc_ns_tree;
+extern struct ns_tree_root mnt_ns_tree;
+extern struct ns_tree_root net_ns_tree;
+extern struct ns_tree_root pid_ns_tree;
+extern struct ns_tree_root time_ns_tree;
+extern struct ns_tree_root user_ns_tree;
+extern struct ns_tree_root uts_ns_tree;
+
+void ns_tree_node_init(struct ns_tree_node *node);
+void ns_tree_root_init(struct ns_tree_root *root);
+bool ns_tree_node_empty(const struct ns_tree_node *node);
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+ struct ns_tree_root *root,
+ int (*cmp)(struct rb_node *, const struct rb_node *));
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root);
#define to_ns_tree(__ns) \
_Generic((__ns), \
@@ -29,17 +41,21 @@ extern struct ns_tree uts_ns_tree;
struct user_namespace *: &(user_ns_tree), \
struct uts_namespace *: &(uts_ns_tree))
-u64 ns_tree_gen_id(struct ns_common *ns);
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree);
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree);
+#define ns_tree_gen_id(__ns) \
+ __ns_tree_gen_id(to_ns_common(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
+
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id);
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree);
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree);
struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type);
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree,
+ struct ns_tree_root *ns_tree,
bool previous);
-static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
+static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id)
{
- ns_tree_gen_id(ns);
+ __ns_tree_gen_id(ns, id);
__ns_tree_add_raw(ns, ns_tree);
}
@@ -59,7 +75,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
* This function assigns a new id to the namespace and adds it to the
* appropriate namespace tree and list.
*/
-#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns))
+#define ns_tree_add(__ns) \
+ __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \
+ (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0))
/**
* ns_tree_remove - Remove a namespace from a namespace tree
@@ -73,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree)
#define ns_tree_adjoined_rcu(__ns, __previous) \
__ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous)
-#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node))
+#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node))
#endif /* _LINUX_NSTREE_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..e601a3144f28 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping,
int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
@@ -53,14 +54,10 @@ static inline int filemap_fdatawait(struct address_space *mapping)
bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
-int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
void __filemap_set_wb_err(struct address_space *mapping, int err);
-int filemap_fdatawrite_wbc(struct address_space *mapping,
- struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);
static inline int filemap_write_and_wait(struct address_space *mapping)
@@ -942,6 +939,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio)
}
/**
+ * folio_next_pos - Get the file position of the next folio.
+ * @folio: The current folio.
+ *
+ * Return: The position of the folio which follows this folio in the file.
+ */
+static inline loff_t folio_next_pos(const struct folio *folio)
+{
+ return (loff_t)folio_next_index(folio) << PAGE_SHIFT;
+}
+
+/**
* folio_file_page - The page for a particular index.
* @folio: The folio which contains this index.
* @index: The index we want to look up.
@@ -977,6 +985,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
+unsigned filemap_get_folios_dirty(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index d1fdf81fbe1e..bf97d49c23cf 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -412,6 +412,8 @@ struct pci_dev {
u16 l1ss; /* L1SS Capability pointer */
#ifdef CONFIG_PCIEASPM
struct pcie_link_state *link_state; /* ASPM link state */
+ unsigned int aspm_l0s_support:1; /* ASPM L0s support */
+ unsigned int aspm_l1_support:1; /* ASPM L1 support */
unsigned int ltr_path:1; /* Latency Tolerance Reporting
supported from root to here */
#endif
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 445517a72ad0..0e7ae12c96d2 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
- if (ns != &init_pid_ns)
- ns_ref_inc(ns);
+ ns_ref_inc(ns);
return ns;
}
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 9d42d473d201..7f6a92ac9704 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t;
typedef unsigned short pipe_index_t;
#endif
-/*
- * We have to declare this outside 'struct pipe_inode_info',
- * but then we can't use 'union pipe_index' for an anonymous
- * union, so we end up having to duplicate this declaration
- * below. Annoying.
+/**
+ * union pipe_index - pipe indices
+ * @head: The point of buffer production
+ * @tail: The point of buffer consumption
+ * @head_tail: unsigned long union of @head and @tail
*/
union pipe_index {
unsigned long head_tail;
@@ -63,9 +63,7 @@ union pipe_index {
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
- * @head: The point of buffer production
- * @tail: The point of buffer consumption
- * @head_tail: unsigned long union of @head and @tail
+ * @pipe_index: the pipe indices
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
@@ -87,14 +85,7 @@ struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t rd_wait, wr_wait;
- /* This has to match the 'union pipe_index' above */
- union {
- unsigned long head_tail;
- struct {
- pipe_index_t head;
- pipe_index_t tail;
- };
- };
+ union pipe_index;
unsigned int max_usage;
unsigned int ring_size;
diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h
index 2503f7625d65..a651e60d9410 100644
--- a/include/linux/pseudo_fs.h
+++ b/include/linux/pseudo_fs.h
@@ -9,6 +9,7 @@ struct pseudo_fs_context {
const struct xattr_handler * const *xattr;
const struct dentry_operations *dops;
unsigned long magic;
+ unsigned int s_d_flags;
};
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index b7fafe999073..624fda17a785 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -8,7 +8,7 @@
#define SUID_DUMP_USER 1 /* Dump as user of process */
#define SUID_DUMP_ROOT 2 /* Dump as root */
-static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm)
+static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm)
{
/*
* By convention, dumpable bits are contained in first 32 bits of the
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5ce48eab7a2a..a8a8661839b6 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1209,4 +1209,118 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
if (seq & 1)
read_sequnlock_excl_irqrestore(lock, flags);
}
+
+enum ss_state {
+ ss_done = 0,
+ ss_lock,
+ ss_lock_irqsave,
+ ss_lockless,
+};
+
+struct ss_tmp {
+ enum ss_state state;
+ unsigned long data;
+ spinlock_t *lock;
+ spinlock_t *lock_irqsave;
+};
+
+static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst)
+{
+ if (sst->lock)
+ spin_unlock(sst->lock);
+ if (sst->lock_irqsave)
+ spin_unlock_irqrestore(sst->lock_irqsave, sst->data);
+}
+
+extern void __scoped_seqlock_invalid_target(void);
+
+#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN)
+/*
+ * For some reason some GCC-8 architectures (nios2, alpha) have trouble
+ * determining that the ss_done state is impossible in __scoped_seqlock_next()
+ * below.
+ *
+ * Similarly KASAN is known to confuse compilers enough to break this. But we
+ * don't care about code quality for KASAN builds anyway.
+ */
+static inline void __scoped_seqlock_bug(void) { }
+#else
+/*
+ * Canary for compiler optimization -- if the compiler doesn't realize this is
+ * an impossible state, it very likely generates sub-optimal code here.
+ */
+extern void __scoped_seqlock_bug(void);
+#endif
+
+static inline void
+__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target)
+{
+ switch (sst->state) {
+ case ss_done:
+ __scoped_seqlock_bug();
+ return;
+
+ case ss_lock:
+ case ss_lock_irqsave:
+ sst->state = ss_done;
+ return;
+
+ case ss_lockless:
+ if (!read_seqretry(lock, sst->data)) {
+ sst->state = ss_done;
+ return;
+ }
+ break;
+ }
+
+ switch (target) {
+ case ss_done:
+ __scoped_seqlock_invalid_target();
+ return;
+
+ case ss_lock:
+ sst->lock = &lock->lock;
+ spin_lock(sst->lock);
+ sst->state = ss_lock;
+ return;
+
+ case ss_lock_irqsave:
+ sst->lock_irqsave = &lock->lock;
+ spin_lock_irqsave(sst->lock_irqsave, sst->data);
+ sst->state = ss_lock_irqsave;
+ return;
+
+ case ss_lockless:
+ sst->data = read_seqbegin(lock);
+ return;
+ }
+}
+
+#define __scoped_seqlock_read(_seqlock, _target, _s) \
+ for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \
+ { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \
+ _s.state != ss_done; \
+ __scoped_seqlock_next(&_s, _seqlock, _target))
+
+/**
+ * scoped_seqlock_read (lock, ss_state) - execute the read side critical
+ * section without manual sequence
+ * counter handling or calls to other
+ * helpers
+ * @lock: pointer to seqlock_t protecting the data
+ * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating
+ * the type of critical read section
+ *
+ * Example:
+ *
+ * scoped_seqlock_read (&lock, ss_lock) {
+ * // read-side critical section
+ * }
+ *
+ * Starts with a lockless pass first. If it fails, restarts the critical
+ * section with the lock held.
+ */
+#define scoped_seqlock_read(_seqlock, _target) \
+ __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock))
+
#endif /* __LINUX_SEQLOCK_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..774efe592a9a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
struct list_head *folio_list);
-void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
+void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end);
int shmem_unuse(unsigned int type);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 66c06fcdfe19..cf84d98964b2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
struct cachestat;
struct statmount;
struct mnt_id_req;
+struct ns_id_req;
struct xattr_args;
struct file_attr;
@@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req,
asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
u64 __user *mnt_ids, size_t nr_mnt_ids,
unsigned int flags);
+asmlinkage long sys_listns(const struct ns_id_req __user *req,
+ u64 __user *ns_ids, size_t nr_ns_ids,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
#if BITS_PER_LONG == 32
diff --git a/include/linux/types.h b/include/linux/types.h
index 6dfdb8e8e4c3..d4437e9c452c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -50,6 +50,7 @@ typedef __kernel_old_gid_t old_gid_t;
#if defined(__GNUC__)
typedef __kernel_loff_t loff_t;
+typedef __kernel_uoff_t uoff_t;
#endif
/*
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index 3aaf19e77558..8285b19a25e0 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -376,6 +376,9 @@ struct usb_gadget_ops {
* can handle. The UDC must support this and all slower speeds and lower
* number of lanes.
* @state: the state we are now (attached, suspended, configured, etc)
+ * @state_lock: Spinlock protecting the `state` and `teardown` members.
+ * @teardown: True if the device is undergoing teardown, used to prevent
+ * new work from being scheduled during cleanup.
* @name: Identifies the controller hardware type. Used in diagnostics
* and sometimes configuration.
* @dev: Driver model state for this abstract device.
@@ -451,6 +454,8 @@ struct usb_gadget {
enum usb_ssp_rate max_ssp_rate;
enum usb_device_state state;
+ spinlock_t state_lock;
+ bool teardown;
const char *name;
struct device dev;
unsigned isoch_delay;
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 9a9aebbf96b9..9c3be157397e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX;
}
-#ifdef CONFIG_USER_NS
-
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
return container_of(ns, struct user_namespace, ns);
}
+#ifdef CONFIG_USER_NS
+
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
if (ns)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index b673c31569f3..75dabb763c65 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -384,7 +384,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
struct virtio_net_hdr_v1_hash_tunnel *vhdr,
bool tnl_hdr_negotiated,
bool little_endian,
- int vlan_hlen)
+ int vlan_hlen,
+ bool has_data_valid)
{
struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr;
unsigned int inner_nh, outer_th;
@@ -394,8 +395,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL |
SKB_GSO_UDP_TUNNEL_CSUM);
if (!tnl_gso_type)
- return virtio_net_hdr_from_skb(skb, hdr, little_endian, false,
- vlan_hlen);
+ return virtio_net_hdr_from_skb(skb, hdr, little_endian,
+ has_data_valid, vlan_hlen);
/* Tunnel support not negotiated but skb ask for it. */
if (!tnl_hdr_negotiated)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 22dd4adc5667..f48e8ccffe81 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -189,11 +189,11 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);
-/* writeback.h requires fs.h; it, too, is not included from here. */
-static inline void wait_on_inode(struct inode *inode)
+static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{
- wait_var_event(inode_state_wait_address(inode, __I_NEW),
- !(READ_ONCE(inode->i_state) & I_NEW));
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ return PAGECACHE_TAG_TOWRITE;
+ return PAGECACHE_TAG_DIRTY;
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -234,7 +234,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
- WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
+ WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
@@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *);
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
+
#endif /* WRITEBACK_H */
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 86b0d47984a1..64e9afe7d647 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *,
const char *, const void *, size_t, int);
int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *,
const char *, const void *, size_t, int,
- struct inode **);
+ struct delegated_inode *);
int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *,
const void *, size_t, int);
int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *,
- const char *, struct inode **);
+ const char *, struct delegated_inode *);
int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 8d0e703bc929..cb4c02d00759 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2783,6 +2783,11 @@ struct hci_ev_le_per_adv_report {
__u8 data[];
} __packed;
+#define HCI_EV_LE_PA_SYNC_LOST 0x10
+struct hci_ev_le_pa_sync_lost {
+ __le16 handle;
+} __packed;
+
#define LE_PA_DATA_COMPLETE 0x00
#define LE_PA_DATA_MORE_TO_COME 0x01
#define LE_PA_DATA_TRUNCATED 0x02
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index b8100dbfe5d7..0cb87687837f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -749,7 +749,6 @@ struct hci_conn {
__u8 remote_cap;
__u8 remote_auth;
- __u8 remote_id;
unsigned int sent;
@@ -857,11 +856,12 @@ extern struct mutex hci_cb_list_lock;
/* ----- HCI interface to upper protocols ----- */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
int l2cap_disconn_ind(struct hci_conn *hcon);
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#if IS_ENABLED(CONFIG_BT_BREDR)
int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb);
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb);
#else
static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
@@ -869,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
return 0;
}
-static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_BT_LE)
int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#else
static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
{
return 0;
}
-static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
- u16 flags)
+
+static inline int iso_recv(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c64fd896b1f9..99ac747b7906 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
case TCF_LAYER_NETWORK:
return skb_network_header(skb);
case TCF_LAYER_TRANSPORT:
+ if (!skb_transport_header_was_set(skb))
+ break;
return skb_transport_header(skb);
}
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f3014e4f54fc..0a14daaa5dd4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family)
static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
- if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
+ if ((x->sel.family != AF_UNSPEC) ||
+ (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
(ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
return &x->inner_mode;
else
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index c08aff044e80..311a341e6fe4 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
/* may be called for files on pseudo FSes w/ unregistered bdi */
strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->flags = flags;
),
@@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
strscpy_pad(__entry->name,
bdi_dev_name(inode_to_bdi(inode)), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->dirtied_when = inode->dirtied_when;
__entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode));
),
@@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
strscpy_pad(__entry->name,
bdi_dev_name(inode_to_bdi(inode)), 32);
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->dirtied_when = inode->dirtied_when;
__entry->writeback_index = inode->i_mapping->writeback_index;
__entry->nr_to_write = nr_to_write;
@@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->state = inode->i_state;
+ __entry->state = inode_state_read_once(inode);
__entry->mode = inode->i_mode;
__entry->dirtied_when = inode->dirtied_when;
),
diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h
index b5f7594eee7a..0a90ad92dbf3 100644
--- a/include/uapi/asm-generic/posix_types.h
+++ b/include/uapi/asm-generic/posix_types.h
@@ -86,6 +86,7 @@ typedef struct {
*/
typedef __kernel_long_t __kernel_off_t;
typedef long long __kernel_loff_t;
+typedef unsigned long long __kernel_uoff_t;
typedef __kernel_long_t __kernel_old_time_t;
#ifndef __KERNEL__
typedef __kernel_long_t __kernel_time_t;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 04e0077fb4c9..942370b3f5d2 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
__SYSCALL(__NR_file_getattr, sys_file_getattr)
#define __NR_file_setattr 469
__SYSCALL(__NR_file_setattr, sys_file_setattr)
+#define __NR_listns 470
+__SYSCALL(__NR_listns, sys_listns)
#undef __NR_syscalls
-#define __NR_syscalls 470
+#define __NR_syscalls 471
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 3741ea1b73d8..5e277fd955aa 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -4,6 +4,11 @@
#include <asm/fcntl.h>
#include <linux/openat2.h>
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
@@ -79,6 +84,17 @@
*/
#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET
+/* Set/Get delegations */
+#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15)
+#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16)
+
+/* Argument structure for F_GETDELEG and F_SETDELEG */
+struct delegation {
+ uint32_t d_flags; /* Must be 0 */
+ uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */
+ uint16_t __pad; /* Must be 0 */
+};
+
/*
* Types of directory notifications that may be requested.
*/
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 9cd89bcc1d9c..30f3c9eaafaa 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -27,7 +27,7 @@
#define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */
#define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */
#define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */
-#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */
+#define INPUT_PROP_PRESSUREPAD 0x07 /* pressure triggers clicks */
#define INPUT_PROP_MAX 0x1f
#define INPUT_PROP_CNT (INPUT_PROP_MAX + 1)
diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h
index 5d754322a27c..3539ccbfd064 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -36,6 +36,9 @@ struct io_uring_query_opcode {
__u64 enter_flags;
/* Bitmask of all supported IOSQE_* flags */
__u64 sqe_flags;
+ /* The number of available query opcodes */
+ __u32 nr_query_opcodes;
+ __u32 __pad;
};
#endif
diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h
index 8197a4800604..40aa545101a3 100644
--- a/include/uapi/linux/isst_if.h
+++ b/include/uapi/linux/isst_if.h
@@ -52,7 +52,7 @@ struct isst_if_cpu_map {
/**
* struct isst_if_cpu_maps - structure for CPU map IOCTL
* @cmd_count: Number of CPU mapping command in cpu_map[]
- * @cpu_map[]: Holds one or more CPU map data structure
+ * @cpu_map: Holds one or more CPU map data structure
*
* This structure used with ioctl ISST_IF_GET_PHY_ID to send
* one or more CPU mapping commands. Here IOCTL return value indicates
@@ -82,8 +82,8 @@ struct isst_if_io_reg {
/**
* struct isst_if_io_regs - structure for IO register commands
- * @cmd_count: Number of io reg commands in io_reg[]
- * @io_reg[]: Holds one or more io_reg command structure
+ * @req_count: Number of io reg commands in io_reg[]
+ * @io_reg: Holds one or more io_reg command structure
*
* This structure used with ioctl ISST_IF_IO_CMD to send
* one or more read/write commands to PUNIT. Here IOCTL return value
@@ -120,7 +120,7 @@ struct isst_if_mbox_cmd {
/**
* struct isst_if_mbox_cmds - structure for mailbox commands
* @cmd_count: Number of mailbox commands in mbox_cmd[]
- * @mbox_cmd[]: Holds one or more mbox commands
+ * @mbox_cmd: Holds one or more mbox commands
*
* This structure used with ioctl ISST_IF_MBOX_COMMAND to send
* one or more mailbox commands to PUNIT. Here IOCTL return value
@@ -152,7 +152,7 @@ struct isst_if_msr_cmd {
/**
* struct isst_if_msr_cmds - structure for msr commands
* @cmd_count: Number of mailbox commands in msr_cmd[]
- * @msr_cmd[]: Holds one or more msr commands
+ * @msr_cmd: Holds one or more msr commands
*
* This structure used with ioctl ISST_IF_MSR_COMMAND to send
* one or more MSR commands. IOCTL return value indicates number of
@@ -167,8 +167,9 @@ struct isst_if_msr_cmds {
* struct isst_core_power - Structure to get/set core_power feature
* @get_set: 0: Get, 1: Set
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @enable: Feature enable status
+ * @supported: Power domain supports SST_CP interface
* @priority_type: Priority type for the feature (ordered/proportional)
*
* Structure to get/set core_power feature state using IOCTL
@@ -187,11 +188,11 @@ struct isst_core_power {
* struct isst_clos_param - Structure to get/set clos praram
* @get_set: 0: Get, 1: Set
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
- * clos: Clos ID for the parameters
- * min_freq_mhz: Minimum frequency in MHz
- * max_freq_mhz: Maximum frequency in MHz
- * prop_prio: Proportional priority from 0-15
+ * @power_domain_id: Power Domain id
+ * @clos: Clos ID for the parameters
+ * @min_freq_mhz: Minimum frequency in MHz
+ * @max_freq_mhz: Maximum frequency in MHz
+ * @prop_prio: Proportional priority from 0-15
*
* Structure to get/set per clos property using IOCTL
* ISST_IF_CLOS_PARAM.
@@ -209,7 +210,7 @@ struct isst_clos_param {
/**
* struct isst_if_clos_assoc - Structure to assign clos to a CPU
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @logical_cpu: CPU number
* @clos: Clos ID to assign to the logical CPU
*
@@ -228,6 +229,7 @@ struct isst_if_clos_assoc {
* @get_set: Request is for get or set
* @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
* Linux CPU number
+ * @assoc_info: CLOS data for this CPU
*
* Structure used to get/set associate CPUs to clos using IOCTL
* ISST_IF_CLOS_ASSOC.
@@ -257,7 +259,7 @@ struct isst_tpmi_instance_count {
/**
* struct isst_perf_level_info - Structure to get information on SST-PP levels
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @logical_cpu: CPU number
* @clos: Clos ID to assign to the logical CPU
* @max_level: Maximum performance level supported by the platform
@@ -267,8 +269,8 @@ struct isst_tpmi_instance_count {
* @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level
* @locked: SST-PP performance level change is locked/unlocked
* @enabled: SST-PP feature is enabled or not
- * @sst-tf_support: SST-TF support status at this level
- * @sst-bf_support: SST-BF support status at this level
+ * @sst_tf_support: SST-TF support status at this level
+ * @sst_bf_support: SST-BF support status at this level
*
* Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS.
*/
@@ -289,7 +291,7 @@ struct isst_perf_level_info {
/**
* struct isst_perf_level_control - Structure to set SST-PP level
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: level to set
*
* Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL.
@@ -303,7 +305,7 @@ struct isst_perf_level_control {
/**
* struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @feature: bit 0 = SST-BF state, bit 1 = SST-TF state
*
* Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE.
@@ -320,7 +322,7 @@ struct isst_perf_feature_control {
/**
* struct isst_perf_level_data_info - Structure to get SST-PP level details
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @tdp_ratio: TDP Ratio
* @base_freq_mhz: Base frequency in MHz
@@ -341,8 +343,8 @@ struct isst_perf_feature_control {
* @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency
* @max_buckets: Maximum trl buckets
* @max_trl_levels: Maximum trl levels
- * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket
- * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency
+ * @bucket_core_counts: Number of cores per bucket
+ * @trl_freq_mhz: maximum frequency
* for a bucket and trl level
*
* Structure used to get information on frequencies and TDP for a SST-PP
@@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info {
/**
* struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @punit_cpu_map: Set to 1 if the CPU number is punit numbering not
* Linux CPU number. If 0 CPU buffer is copied to user space
@@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask {
/**
* struct isst_base_freq_info - Structure to get SST-BF frequencies
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @high_base_freq_mhz: High priority CPU base frequency
* @low_base_freq_mhz: Low priority CPU base frequency
@@ -453,9 +455,11 @@ struct isst_base_freq_info {
/**
* struct isst_turbo_freq_info - Structure to get SST-TF frequencies
* @socket_id: Socket/package id
- * @power_domain: Power Domain id
+ * @power_domain_id: Power Domain id
* @level: SST-PP level for which caller wants to get information
* @max_clip_freqs: Maximum number of low priority core clipping frequencies
+ * @max_buckets: Maximum trl buckets
+ * @max_trl_levels: Maximum trl levels
* @lp_clip_freq_mhz: Clip frequencies per trl level
* @bucket_core_counts: Maximum number of cores for a bucket
* @trl_freq_mhz: Frequencies per trl level for each bucket
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 7fa67c2031a5..5d3f8c9e3a62 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -197,7 +197,7 @@ struct statmount {
*/
struct mnt_id_req {
__u32 size;
- __u32 spare;
+ __u32 mnt_ns_fd;
__u64 mnt_id;
__u64 param;
__u64 mnt_ns_id;
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index e098759ec917..a25e38d1c874 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -67,4 +67,62 @@ struct nsfs_file_handle {
#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @ns_type: namespace type filter mask (listns only)
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @ns_id represents the last listed namespace id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 957db425d459..ea9a6811fc76 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -26,8 +26,12 @@
#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */
#define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */
#define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */
+#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */
+#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. */
#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
+#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */
+#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */
/*
* Values for @coredump_mask in pidfd_info.
@@ -91,8 +95,11 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
- __u32 coredump_mask;
- __u32 __spare1;
+ struct /* coredump info */ {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+ };
+ __u64 supported_mask; /* Mask flags that this kernel supports */
};
#define PIDFS_IOCTL_MAGIC 0xFF
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 386ad36f1a0a..cab5cadca8ef 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -249,8 +249,9 @@ struct tee_ioctl_param {
* @cancel_id: [in] Cancellation id, a unique value to identify this request
* @session: [out] Session id
* @ret: [out] return value
- * @ret_origin [out] origin of the return value
- * @num_params [in] number of parameters following this struct
+ * @ret_origin: [out] origin of the return value
+ * @num_params: [in] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_open_session_arg {
__u8 uuid[TEE_IOCTL_UUID_LEN];
@@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg {
struct tee_ioctl_buf_data)
/**
- * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted
- * Application
+ * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application
* @func: [in] Trusted Application function, specific to the TA
* @session: [in] Session id
* @cancel_id: [in] Cancellation id, a unique value to identify this request
* @ret: [out] return value
- * @ret_origin [out] origin of the return value
- * @num_params [in] number of parameters following this struct
+ * @ret_origin: [out] origin of the return value
+ * @num_params: [in] number of parameters following this struct
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_invoke_arg {
__u32 func;
@@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg {
/**
* struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function
* @func: [in] supplicant function
- * @num_params [in/out] number of parameters following this struct
+ * @num_params: [in/out] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*
* @num_params is the number of params that tee-supplicant has room to
* receive when input, @num_params is the number of actual params
@@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg {
/**
* struct tee_iocl_supp_send_arg - Send a response to a received request
* @ret: [out] return value
- * @num_params [in] number of parameters following this struct
+ * @num_params: [in] number of &struct tee_ioctl_param entries in @params
+ * @params: array of ioctl parameters
*/
struct tee_iocl_supp_send_arg {
__u32 ret;
@@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data {
*/
/**
- * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application
+ * struct tee_ioctl_object_invoke_arg - Invokes an object in a
+ * Trusted Application
* @id: [in] Object id
* @op: [in] Object operation, specific to the object
* @ret: [out] return value
* @num_params: [in] number of parameters following this struct
+ * @params: array of ioctl parameters
*/
struct tee_ioctl_object_invoke_arg {
__u64 id;
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 6af29da8889e..64d5e25a2cb5 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -120,7 +120,8 @@ static int __init fs_names_setup(char *str)
static unsigned int __initdata root_delay;
static int __init root_delay_setup(char *str)
{
- root_delay = simple_strtoul(str, NULL, 0);
+ if (kstrtouint(str, 0, &root_delay))
+ return 0;
return 1;
}
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index 19d9f33dcacf..eddbe5cb0413 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -29,8 +29,7 @@ int __initdata rd_image_start; /* starting block # of image */
static int __init ramdisk_start_setup(char *str)
{
- rd_image_start = simple_strtol(str,NULL,0);
- return 1;
+ return kstrtoint(str, 0, &rd_image_start) == 0;
}
__setup("ramdisk_start=", ramdisk_start_setup);
diff --git a/init/init_task.c b/init/init_task.c
index a55e2189206f..d970a847b657 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -62,6 +62,33 @@ unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] = {
};
#endif
+/* init to 2 - one for init_task, one to ensure it is never freed */
+static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
+
+/*
+ * The initial credentials for the initial task
+ */
+static struct cred init_cred = {
+ .usage = ATOMIC_INIT(4),
+ .uid = GLOBAL_ROOT_UID,
+ .gid = GLOBAL_ROOT_GID,
+ .suid = GLOBAL_ROOT_UID,
+ .sgid = GLOBAL_ROOT_GID,
+ .euid = GLOBAL_ROOT_UID,
+ .egid = GLOBAL_ROOT_GID,
+ .fsuid = GLOBAL_ROOT_UID,
+ .fsgid = GLOBAL_ROOT_GID,
+ .securebits = SECUREBITS_DEFAULT,
+ .cap_inheritable = CAP_EMPTY_SET,
+ .cap_permitted = CAP_FULL_SET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
+ .user = INIT_USER,
+ .user_ns = &init_user_ns,
+ .group_info = &init_groups,
+ .ucounts = &init_ucounts,
+};
+
/*
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
diff --git a/init/version-timestamp.c b/init/version-timestamp.c
index d071835121c2..375726e05f69 100644
--- a/init/version-timestamp.c
+++ b/init/version-timestamp.c
@@ -8,8 +8,7 @@
#include <linux/utsname.h>
struct uts_namespace init_uts_ns = {
- .ns.ns_type = ns_common_type(&init_uts_ns),
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_uts_ns),
.name = {
.sysname = UTS_SYSNAME,
.nodename = UTS_NODENAME,
@@ -19,10 +18,6 @@ struct uts_namespace init_uts_ns = {
.domainname = UTS_DOMAINNAME,
},
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_uts_ns),
-#ifdef CONFIG_UTS_NS
- .ns.ops = &utsns_operations,
-#endif
};
/* FIXED STRINGS! Don't touch! */
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index 27a09aa4c9d0..3b75931bd569 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -127,7 +127,7 @@ static int io_uring_cmd_timestamp(struct socket *sock,
if (!unlikely(skb_queue_empty(&list))) {
scoped_guard(spinlock_irqsave, &q->lock)
- skb_queue_splice(q, &list);
+ skb_queue_splice(&list, q);
}
return -EAGAIN;
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 296667ba712c..02339b74ba8d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -634,6 +634,8 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
is_cqe32 = true;
cqe_size <<= 1;
}
+ if (ctx->flags & IORING_SETUP_CQE32)
+ is_cqe32 = false;
if (!dying) {
if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index 45d3735b2708..3ffac8f72974 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -211,10 +211,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
const struct file_operations *fops = &io_mock_fops;
const struct io_uring_sqe *sqe = cmd->sqe;
struct io_uring_mock_create mc, __user *uarg;
- struct io_mock_file *mf = NULL;
- struct file *file = NULL;
+ struct file *file;
+ struct io_mock_file *mf __free(kfree) = NULL;
size_t uarg_size;
- int fd = -1, ret;
/*
* It's a testing only driver that allows exercising edge cases
@@ -246,10 +245,6 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
if (!mf)
return -ENOMEM;
- ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
- if (fd < 0)
- goto fail;
-
init_waitqueue_head(&mf->poll_wq);
mf->size = mc.file_size;
mf->rw_delay_ns = mc.rw_delay_ns;
@@ -258,33 +253,25 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
mf->pollable = true;
}
- file = anon_inode_create_getfile("[io_uring_mock]", fops,
- mf, O_RDWR | O_CLOEXEC, NULL);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto fail;
- }
+ FD_PREPARE(fdf, O_RDWR | O_CLOEXEC,
+ anon_inode_create_getfile("[io_uring_mock]", fops, mf,
+ O_RDWR | O_CLOEXEC, NULL));
+ if (fdf.err)
+ return fdf.err;
- file->f_mode |= FMODE_READ | FMODE_CAN_READ |
- FMODE_WRITE | FMODE_CAN_WRITE |
- FMODE_LSEEK;
+ retain_and_null_ptr(mf);
+ file = fd_prepare_file(fdf);
+ file->f_mode |= FMODE_READ | FMODE_CAN_READ | FMODE_WRITE |
+ FMODE_CAN_WRITE | FMODE_LSEEK;
if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
file->f_mode |= FMODE_NOWAIT;
- mc.out_fd = fd;
- if (copy_to_user(uarg, &mc, uarg_size)) {
- fput(file);
- ret = -EFAULT;
- goto fail;
- }
+ mc.out_fd = fd_prepare_fd(fdf);
+ if (copy_to_user(uarg, &mc, uarg_size))
+ return -EFAULT;
- fd_install(fd, file);
+ fd_publish(fdf);
return 0;
-fail:
- if (fd >= 0)
- put_unused_fd(fd);
- kfree(mf);
- return ret;
}
static int io_probe_mock(struct io_uring_cmd *cmd)
diff --git a/io_uring/net.c b/io_uring/net.c
index a95cc9ca2a4d..43d77f95db51 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1532,8 +1532,10 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
int ret;
- ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
- &kmsg->vec, uvec_segs, issue_flags);
+ sr->notif->buf_index = req->buf_index;
+ ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
+ sr->notif, &kmsg->vec, uvec_segs,
+ issue_flags);
if (unlikely(ret))
return ret;
req->flags &= ~REQ_F_IMPORT_BUFFER;
diff --git a/io_uring/query.c b/io_uring/query.c
index 645301bd2c82..cf02893ba911 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -20,6 +20,8 @@ static ssize_t io_query_ops(void *data)
e->ring_setup_flags = IORING_SETUP_FLAGS;
e->enter_flags = IORING_ENTER_FLAGS;
e->sqe_flags = SQE_VALID_FLAGS;
+ e->nr_query_opcodes = __IO_URING_QUERY_MAX;
+ e->__pad = 0;
return sizeof(*e);
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 2602d76d5ff0..0010c4992490 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -943,8 +943,8 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
struct req_iterator rq_iter;
struct io_mapped_ubuf *imu;
struct io_rsrc_node *node;
- struct bio_vec bv, *bvec;
- u16 nr_bvecs;
+ struct bio_vec bv;
+ unsigned int nr_bvecs = 0;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
@@ -965,8 +965,11 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
goto unlock;
}
- nr_bvecs = blk_rq_nr_phys_segments(rq);
- imu = io_alloc_imu(ctx, nr_bvecs);
+ /*
+ * blk_rq_nr_phys_segments() may overestimate the number of bvecs
+ * but avoids needing to iterate over the bvecs
+ */
+ imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq));
if (!imu) {
kfree(node);
ret = -ENOMEM;
@@ -977,16 +980,15 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
imu->len = blk_rq_bytes(rq);
imu->acct_pages = 0;
imu->folio_shift = PAGE_SHIFT;
- imu->nr_bvecs = nr_bvecs;
refcount_set(&imu->refs, 1);
imu->release = release;
imu->priv = rq;
imu->is_kbuf = true;
imu->dir = 1 << rq_data_dir(rq);
- bvec = imu->bvec;
rq_for_each_bvec(bv, rq, rq_iter)
- *bvec++ = bv;
+ imu->bvec[nr_bvecs++] = bv;
+ imu->nr_bvecs = nr_bvecs;
node->buf = imu;
data->nodes[index] = node;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 5b2241a5813c..6310a3d08409 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -277,7 +277,6 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else {
rw->kiocb.ki_ioprio = get_current_ioprio();
}
- rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
@@ -463,7 +462,10 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
void io_readv_writev_cleanup(struct io_kiocb *req)
{
+ struct io_async_rw *rw = req->async_data;
+
lockdep_assert_held(&req->ctx->uring_lock);
+ io_vec_free(&rw->vec);
io_rw_recycle(req, 0);
}
@@ -566,15 +568,6 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
{
- struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
- struct kiocb *kiocb = &rw->kiocb;
-
- if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
- long res = kiocb->dio_complete(rw->kiocb.private);
-
- io_req_set_res(req, io_fixup_rw_res(req, res), 0);
- }
-
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
@@ -589,10 +582,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
- if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
- __io_complete_rw_common(req, res);
- io_req_set_res(req, io_fixup_rw_res(req, res), 0);
- }
+ __io_complete_rw_common(req, res);
+ io_req_set_res(req, io_fixup_rw_res(req, res), 0);
req->io_task_work.func = io_req_rw_complete;
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 093551fe66a7..56e811f9e5fa 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -892,15 +892,35 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);
}
+static struct file *mqueue_file_open(struct filename *name,
+ struct vfsmount *mnt, int oflag, bool ro,
+ umode_t mode, struct mq_attr *attr)
+{
+ struct dentry *dentry;
+ struct file *file;
+ int ret;
+
+ dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name));
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+
+ ret = prepare_open(dentry, oflag, ro, mode, name, attr);
+ file = ERR_PTR(ret);
+ if (!ret) {
+ const struct path path = { .mnt = mnt, .dentry = dentry };
+ file = dentry_open(&path, oflag, current_cred());
+ }
+
+ end_creating(dentry);
+ return file;
+}
+
static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
struct mq_attr *attr)
{
+ struct filename *name __free(putname) = NULL;;
struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
- struct dentry *root = mnt->mnt_root;
- struct filename *name;
- struct path path;
- int fd, error;
- int ro;
+ int fd, ro;
audit_mq_open(oflag, mode, attr);
@@ -908,37 +928,10 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
if (IS_ERR(name))
return PTR_ERR(name);
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0)
- goto out_putname;
-
ro = mnt_want_write(mnt); /* we'll drop it in any case */
- inode_lock(d_inode(root));
- path.dentry = lookup_noperm(&QSTR(name->name), root);
- if (IS_ERR(path.dentry)) {
- error = PTR_ERR(path.dentry);
- goto out_putfd;
- }
- path.mnt = mntget(mnt);
- error = prepare_open(path.dentry, oflag, ro, mode, name, attr);
- if (!error) {
- struct file *file = dentry_open(&path, oflag, current_cred());
- if (!IS_ERR(file))
- fd_install(fd, file);
- else
- error = PTR_ERR(file);
- }
- path_put(&path);
-out_putfd:
- if (error) {
- put_unused_fd(fd);
- fd = error;
- }
- inode_unlock(d_inode(root));
+ fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr));
if (!ro)
mnt_drop_write(mnt);
-out_putname:
- putname(name);
return fd;
}
@@ -957,7 +950,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
int err;
struct filename *name;
struct dentry *dentry;
- struct inode *inode = NULL;
+ struct inode *inode;
struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
struct vfsmount *mnt = ipc_ns->mq_mnt;
@@ -969,26 +962,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
err = mnt_want_write(mnt);
if (err)
goto out_name;
- inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
- dentry = lookup_noperm(&QSTR(name->name), mnt->mnt_root);
+ dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
- goto out_unlock;
+ goto out_drop_write;
}
inode = d_inode(dentry);
- if (!inode) {
- err = -ENOENT;
- } else {
- ihold(inode);
- err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent),
- dentry, NULL);
- }
- dput(dentry);
-
-out_unlock:
- inode_unlock(d_inode(mnt->mnt_root));
+ ihold(inode);
+ err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root),
+ dentry, NULL);
+ end_removing(dentry);
iput(inode);
+
+out_drop_write:
mnt_drop_write(mnt);
out_name:
putname(name);
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 7a03f6d03de3..e28f0cecb2ec 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -27,13 +27,8 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(1),
+ .ns = NS_COMMON_INIT(init_ipc_ns),
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_ipc_ns),
-#ifdef CONFIG_IPC_NS
- .ns.ops = &ipcns_operations,
-#endif
- .ns.ns_type = ns_common_type(&init_ipc_ns),
};
struct msg_msgseg {
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59b12fcb40bd..c0dbfdd9015f 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,6 +66,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
if (err)
goto fail_free;
+ ns_tree_gen_id(ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
@@ -86,7 +87,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
sem_init_ns(ns);
shm_init_ns(ns);
- ns_tree_add(ns);
+ ns_tree_add_raw(ns);
return ns;
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 422270d64820..54e581072617 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -109,6 +109,15 @@ config KEXEC_HANDOVER
to keep data or state alive across the kexec. For this to work,
both source and target kernels need to have this option enabled.
+config KEXEC_HANDOVER_DEBUG
+ bool "Enable Kexec Handover debug checks"
+ depends on KEXEC_HANDOVER
+ help
+ This option enables extra sanity checks for the Kexec Handover
+ subsystem. Since KHO performance is crucial in live update
+ scenarios and the extra code might add overhead, it is
+ only optionally enabled.
+
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
diff --git a/kernel/Makefile b/kernel/Makefile
index df3dd8291bb6..9fe722305c9b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
+obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/acct.c b/kernel/acct.c
index 61630110e29d..2a2b3c874acd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct)
static void acct_write_process(struct bsd_acct_struct *acct)
{
struct file *file = acct->file;
- const struct cred *cred;
acct_t *ac = &acct->ac;
/* Perform file operations on behalf of whoever enabled accounting */
- cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system. Then get freeze protection. If
- * the fs is frozen, just skip the write as we could deadlock
- * the system otherwise.
- */
- if (check_free_space(acct) && file_start_write_trylock(file)) {
- /* it's been opened O_APPEND, so position is irrelevant */
- loff_t pos = 0;
- __kernel_write(file, ac, sizeof(acct_t), &pos);
- file_end_write(file);
+ scoped_with_creds(file->f_cred) {
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
+ */
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
}
-
- revert_creds(cred);
}
static void do_acct_process(struct bsd_acct_struct *acct)
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 6ac35430c573..eec60b57bd3d 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -634,37 +634,24 @@ release_prog:
int bpf_iter_new_fd(struct bpf_link *link)
{
struct bpf_iter_link *iter_link;
- struct file *file;
unsigned int flags;
- int err, fd;
+ int err;
if (link->ops != &bpf_iter_link_lops)
return -EINVAL;
flags = O_RDONLY | O_CLOEXEC;
- fd = get_unused_fd_flags(flags);
- if (fd < 0)
- return fd;
-
- file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto free_fd;
- }
+
+ FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags));
+ if (fdf.err)
+ return fdf.err;
iter_link = container_of(link, struct bpf_iter_link, link);
- err = prepare_seq_file(file, iter_link);
+ err = prepare_seq_file(fd_prepare_file(fdf), iter_link);
if (err)
- goto free_file;
+ return err; /* Automatic cleanup handles fput */
- fd_install(fd, file);
- return fd;
-
-free_file:
- fput(file);
-free_fd:
- put_unused_fd(fd);
- return err;
+ return fd_publish(fdf);
}
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb25e70e0bdc..e4007fea4909 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4169,7 +4169,8 @@ release_prog:
}
/**
- * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
+ * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4178,15 +4179,17 @@ release_prog:
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
}
/**
- * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode
+ * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME
+ * mode
* @task: Task struct for which callback should be scheduled
* @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping
* @map__map: bpf_map that embeds struct bpf_task_work in the values
@@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b
*
* Return: 0 if task work has been scheduled successfully, negative error code otherwise
*/
-__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
- void *map__map, bpf_task_work_callback_t callback,
- void *aux__prog)
+__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task,
+ struct bpf_task_work *tw, void *map__map,
+ bpf_task_work_callback_t callback,
+ void *aux__prog)
{
return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME);
}
@@ -4376,9 +4380,9 @@ BTF_ID_FLAGS(func, bpf_strnstr);
#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
#endif
-BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index eb6c5a21c2ef..ff16c631951b 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -355,7 +355,8 @@ __bpf_kfunc_start_defs();
* Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
* enum in headers.
*/
-__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
+__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ u32 len__sz, void *aux__prog)
{
struct bpf_bprintf_data data = {
.get_bin_args = true,
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 0bbe412f854e..feecd8f4dbf9 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = {
int bpf_token_create(union bpf_attr *attr)
{
+ struct bpf_token *token __free(kfree) = NULL;
struct bpf_mount_opts *mnt_opts;
- struct bpf_token *token = NULL;
struct user_namespace *userns;
struct inode *inode;
- struct file *file;
CLASS(fd, f)(attr->token_create.bpffs_fd);
struct path path;
struct super_block *sb;
umode_t mode;
- int err, fd;
+ int err;
if (fd_empty(f))
return -EBADF;
@@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr)
inode->i_fop = &bpf_token_fops;
clear_nlink(inode); /* make sure it is unlinked */
- file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops);
- if (IS_ERR(file)) {
- iput(inode);
- return PTR_ERR(file);
- }
+ FD_PREPARE(fdf, O_CLOEXEC,
+ alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME,
+ O_RDWR, &bpf_token_fops));
+ if (fdf.err)
+ return fdf.err;
token = kzalloc(sizeof(*token), GFP_USER);
- if (!token) {
- err = -ENOMEM;
- goto out_file;
- }
+ if (!token)
+ return -ENOMEM;
atomic64_set(&token->refcnt, 1);
- /* remember bpffs owning userns for future ns_capable() checks */
- token->userns = get_user_ns(userns);
-
+ /* remember bpffs owning userns for future ns_capable() checks. */
+ token->userns = userns;
token->allowed_cmds = mnt_opts->delegate_cmds;
token->allowed_maps = mnt_opts->delegate_maps;
token->allowed_progs = mnt_opts->delegate_progs;
@@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr)
err = security_bpf_token_create(token, attr, &path);
if (err)
- goto out_token;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- err = fd;
- goto out_token;
- }
-
- file->private_data = token;
- fd_install(fd, file);
-
- return fd;
+ return err;
-out_token:
- bpf_token_free(token);
-out_file:
- fput(file);
- return err;
+ get_user_ns(token->userns);
+ fd_prepare_file(fdf)->private_data = no_free_ptr(token);
+ return fd_publish(fdf);
}
int bpf_token_get_info_by_fd(struct bpf_token *token,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 5949095e51c3..f2cb0b097093 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -479,11 +479,6 @@ again:
* BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
* trampoline again, and retry register.
*/
- /* reset fops->func and fops->trampoline for re-register */
- tr->fops->func = NULL;
- tr->fops->trampoline = 0;
-
- /* free im memory and reallocate later */
bpf_tramp_image_free(im);
goto again;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ff40e5e65c43..fbe4bb91c564 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
struct bpf_func_state *fold, *fcur;
- int i, fr;
+ int i, fr, num_slots;
reset_idmap_scratch(env);
for (fr = old->curframe; fr >= 0; fr--) {
@@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env,
&fcur->regs[i],
&env->idmap_scratch);
- for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) {
+ num_slots = min(fold->allocated_stack / BPF_REG_SIZE,
+ fcur->allocated_stack / BPF_REG_SIZE);
+ for (i = 0; i < num_slots; i++) {
if (!is_spilled_reg(&fold->stack[i]) ||
!is_spilled_reg(&fcur->stack[i]))
continue;
@@ -12259,8 +12261,8 @@ enum special_kfunc_type {
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
KF___bpf_trap,
- KF_bpf_task_work_schedule_signal,
- KF_bpf_task_work_schedule_resume,
+ KF_bpf_task_work_schedule_signal_impl,
+ KF_bpf_task_work_schedule_resume_impl,
};
BTF_ID_LIST(special_kfunc_list)
@@ -12331,13 +12333,13 @@ BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
BTF_ID(func, __bpf_trap)
-BTF_ID(func, bpf_task_work_schedule_signal)
-BTF_ID(func, bpf_task_work_schedule_resume)
+BTF_ID(func, bpf_task_work_schedule_signal_impl)
+BTF_ID(func, bpf_task_work_schedule_resume_impl)
static bool is_task_work_add_kfunc(u32 func_id)
{
- return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] ||
- func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume];
+ return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] ||
+ func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl];
}
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index fdee387f0d6b..ae1eb7a85eb4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_cgroup_ns),
.user_ns = &init_user_ns,
- .ns.ops = &cgroupns_operations,
- .ns.inum = ns_init_inum(&init_cgroup_ns),
.root_cset = &init_css_set,
- .ns.ns_type = ns_common_type(&init_cgroup_ns),
};
static struct file_system_type cgroup2_fs_type;
@@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
} else {
/*
* NOTE: This function may be called from bpf_cgroup_from_id()
- * on a task which has already passed exit_task_namespaces() and
- * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
- * cgroups visible for lookups.
+ * on a task which has already passed exit_nsproxy_namespaces()
+ * and nsproxy == NULL. Fall back to cgrp_dfl_root which will
+ * make all cgroups visible for lookups.
*/
return &cgrp_dfl_root.cgrp;
}
@@ -5363,7 +5360,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
- const struct cred *saved_cred;
ssize_t ret;
enum cgroup_attach_lock_mode lock_mode;
@@ -5386,11 +5382,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
* permissions using the credentials from file open to protect against
* inherited fd attacks.
*/
- saved_cred = override_creds(of->file->f_cred);
- ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb,
- threadgroup, ctx->ns);
- revert_creds(saved_cred);
+ scoped_with_creds(of->file->f_cred)
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
if (ret)
goto out_finish;
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index fdbe57578e68..db9617556dd7 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
ret = ns_common_init(new_ns);
if (ret)
return ERR_PTR(ret);
- ns_tree_add(new_ns);
return no_free_ptr(new_ns);
}
@@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags,
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
+ ns_tree_add(new_ns);
return new_ns;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 3b1c43382eec..99dac1aa972a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res,
old_res->start = 0;
old_res->end = 0;
} else {
- crashk_res.end = ram_res->start - 1;
+ old_res->end = ram_res->start - 1;
}
crash_free_reserved_phys_range(ram_res->start, ram_res->end);
diff --git a/kernel/cred.c b/kernel/cred.c
index dbf6b687dc5c..a6f686b30da1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,33 +35,6 @@ do { \
static struct kmem_cache *cred_jar;
-/* init to 2 - one for init_task, one to ensure it is never freed */
-static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) };
-
-/*
- * The initial credentials for the initial task
- */
-struct cred init_cred = {
- .usage = ATOMIC_INIT(4),
- .uid = GLOBAL_ROOT_UID,
- .gid = GLOBAL_ROOT_GID,
- .suid = GLOBAL_ROOT_UID,
- .sgid = GLOBAL_ROOT_GID,
- .euid = GLOBAL_ROOT_UID,
- .egid = GLOBAL_ROOT_GID,
- .fsuid = GLOBAL_ROOT_UID,
- .fsgid = GLOBAL_ROOT_GID,
- .securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_EMPTY_SET,
- .cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_FULL_SET,
- .cap_bset = CAP_FULL_SET,
- .user = INIT_USER,
- .user_ns = &init_user_ns,
- .group_info = &init_groups,
- .ucounts = &init_ucounts,
-};
-
/*
* The RCU callback to actually dispose of a set of credentials
*/
@@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
kdebug("share_creds(%p{%ld})",
p->cred, atomic_long_read(&p->cred->usage));
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
return 0;
}
@@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags)
p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ get_cred_namespaces(p);
+
return 0;
error_put:
@@ -435,10 +411,13 @@ int commit_creds(struct cred *new)
*/
if (new->user != old->user || new->user_ns != old->user_ns)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
+ if (new->user_ns != old->user_ns)
+ switch_cred_namespaces(old, new);
/* send notifications */
if (!uid_eq(new->uid, old->uid) ||
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..f973e7e73c90 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
case PCI_P2PDMA_MAP_BUS_ADDR:
sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
sg_phys(sg));
+ sg_dma_len(sg) = sg->length;
sg_dma_mark_bus_address(sg);
continue;
default:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1fd347da9026..2c35acc2722b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11901,7 +11901,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags)
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
- cpu_clock_event_stop(event, flags);
+ cpu_clock_event_stop(event, PERF_EF_UPDATE);
}
static void cpu_clock_event_read(struct perf_event *event)
diff --git a/kernel/exit.c b/kernel/exit.c
index 9f74e8f1c431..988e16efd66b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -291,6 +291,7 @@ repeat:
write_unlock_irq(&tasklist_lock);
/* @thread_pid can't go away until free_pids() below */
proc_flush_pid(thread_pid);
+ exit_cred_namespaces(p);
add_device_randomness(&p->se.sum_exec_runtime,
sizeof(p->se.sum_exec_runtime));
free_pids(post.pids);
@@ -962,7 +963,7 @@ void __noreturn do_exit(long code)
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
- exit_task_namespaces(tsk);
+ exit_nsproxy_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da0f08615a9..f1857672426e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2453,7 +2453,7 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
mm_clear_owner(p->mm, p);
@@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index a08cc076f332..ffde93d051a4 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/mm.h>
#include "gcov.h"
-#if (__GNUC__ >= 14)
+#if (__GNUC__ >= 15)
+#define GCOV_COUNTERS 10
+#elif (__GNUC__ >= 14)
#define GCOV_COUNTERS 9
#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 76f0940fb485..03d12e27189f 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "KHO: " fmt
+#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
@@ -22,6 +23,7 @@
#include <asm/early_ioremap.h>
+#include "kexec_handover_internal.h"
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
@@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable);
* Keep track of memory that is to be preserved across KHO.
*
* The serializing side uses two levels of xarrays to manage chunks of per-order
- * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
- * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
- * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
- * 512K of bitmap memory will be needed for order 0.
+ * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
+ * of an 8TB system would fit inside a single 4096 byte bitmap. For order 0
+ * allocations each bitmap will cover 128M of address space. Thus, for 16G of
+ * memory at most 512K of bitmap memory will be needed for order 0.
*
* This approach is fully incremental, as the serialization progresses folios
* can continue be aggregated to the tracker. The final step, immediately prior
@@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable);
* successor kernel to parse.
*/
-#define PRESERVE_BITS (512 * 8)
+#define PRESERVE_BITS (PAGE_SIZE * 8)
struct kho_mem_phys_bits {
DECLARE_BITMAP(preserve, PRESERVE_BITS);
};
+static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
+
struct kho_mem_phys {
/*
* Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
@@ -131,28 +135,28 @@ static struct kho_out kho_out = {
.finalized = false,
};
-static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
- void *elm, *res;
+ void *res = xa_load(xa, index);
+
+ if (res)
+ return res;
- elm = xa_load(xa, index);
- if (elm)
- return elm;
+ void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
- elm = kzalloc(sz, GFP_KERNEL);
if (!elm)
return ERR_PTR(-ENOMEM);
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
if (xa_is_err(res))
- res = ERR_PTR(xa_err(res));
-
- if (res) {
- kfree(elm);
+ return ERR_PTR(xa_err(res));
+ else if (res)
return res;
- }
- return elm;
+ return no_free_ptr(elm);
}
static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
@@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
const unsigned long pfn_high = pfn >> order;
physxa = xa_load(&track->orders, order);
- if (!physxa)
- continue;
+ if (WARN_ON_ONCE(!physxa))
+ return;
bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
- if (!bits)
- continue;
+ if (WARN_ON_ONCE(!bits))
+ return;
clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
@@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
}
}
- bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
- sizeof(*bits));
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
if (IS_ERR(bits))
return PTR_ERR(bits);
@@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
unsigned long order)
{
- struct khoser_mem_chunk *chunk;
+ struct khoser_mem_chunk *chunk __free(free_page) = NULL;
- chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ chunk = (void *)get_zeroed_page(GFP_KERNEL);
if (!chunk)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
+ return ERR_PTR(-EINVAL);
+
chunk->hdr.order = order;
if (cur_chunk)
KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
- return chunk;
+ return no_free_ptr(chunk);
}
static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
@@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser)
struct khoser_mem_chunk *chunk = NULL;
struct kho_mem_phys *physxa;
unsigned long order;
+ int err = -ENOMEM;
xa_for_each(&ser->track.orders, order, physxa) {
struct kho_mem_phys_bits *bits;
unsigned long phys;
chunk = new_chunk(chunk, order);
- if (!chunk)
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
goto err_free;
+ }
if (!first_chunk)
first_chunk = chunk;
@@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser)
if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
chunk = new_chunk(chunk, order);
- if (!chunk)
+ if (IS_ERR(chunk)) {
+ err = PTR_ERR(chunk);
goto err_free;
+ }
}
elm = &chunk->bitmaps[chunk->hdr.num_elms];
@@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
err_free:
kho_mem_ser_free(first_chunk);
- return -ENOMEM;
+ return err;
}
static void __init deserialize_bitmap(unsigned int order,
@@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt)
* area for early allocations that happen before page allocator is
* initialized.
*/
-static struct kho_scratch *kho_scratch;
-static unsigned int kho_scratch_cnt;
+struct kho_scratch *kho_scratch;
+unsigned int kho_scratch_cnt;
/*
* The scratch areas are scaled by default as percent of memory allocated from
@@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio)
const unsigned int order = folio_order(folio);
struct kho_mem_track *track = &kho_out.ser.track;
+ if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
+ return -EINVAL;
+
return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
unsigned long failed_pfn = 0;
int err = 0;
+ if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT))) {
+ return -EINVAL;
+ }
+
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
@@ -862,16 +882,17 @@ err_free:
return NULL;
}
-static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
+static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
+ unsigned short order)
{
struct kho_mem_track *track = &kho_out.ser.track;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1);
- for (int i = 0; chunk->phys[i]; i++) {
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
pfn = PHYS_PFN(chunk->phys[i]);
- __kho_unpreserve(track, pfn, pfn + 1);
+ __kho_unpreserve(track, pfn, pfn + (1 << order));
}
}
@@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
while (chunk) {
struct kho_vmalloc_chunk *tmp = chunk;
- kho_vmalloc_unpreserve_chunk(chunk);
+ kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
free_page((unsigned long)tmp);
@@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
while (chunk) {
struct page *page;
- for (int i = 0; chunk->phys[i]; i++) {
+ for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
phys_addr_t phys = chunk->phys[i];
if (idx + contig_pages > total_pages)
diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c
new file mode 100644
index 000000000000..6efb696f5426
--- /dev/null
+++ b/kernel/kexec_handover_debug.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover_debug.c - kexec handover optional debug functionality
+ * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include "kexec_handover_internal.h"
+
+bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ phys_addr_t scratch_start, scratch_end;
+ unsigned int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ scratch_start = kho_scratch[i].addr;
+ scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
+
+ if (phys < scratch_end && (phys + size) > scratch_start)
+ return true;
+ }
+
+ return false;
+}
diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h
new file mode 100644
index 000000000000..3c3c7148ceed
--- /dev/null
+++ b/kernel/kexec_handover_internal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
+#define LINUX_KEXEC_HANDOVER_INTERNAL_H
+
+#include <linux/kexec_handover.h>
+#include <linux/types.h>
+
+extern struct kho_scratch *kho_scratch;
+extern unsigned int kho_scratch_cnt;
+
+#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
+bool kho_scratch_overlap(phys_addr_t phys, size_t size);
+#else
+static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
+{
+ return false;
+}
+#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
+
+#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 949103fd8e9b..2c6b02d4699b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock)
}
}
-void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key)
+void debug_mutex_init(struct mutex *lock)
{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held lock:
- */
- debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
-#endif
lock->magic = lock;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index de7d6702cd96..2a1d165b3167 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -43,8 +43,7 @@
# define MUTEX_WARN_ON(cond)
#endif
-void
-__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
+static void __mutex_init_generic(struct mutex *lock)
{
atomic_long_set(&lock->owner, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
osq_lock_init(&lock->osq);
#endif
-
- debug_mutex_init(lock, name, key);
+ debug_mutex_init(lock);
}
-EXPORT_SYMBOL(__mutex_init);
static inline struct task_struct *__owner_task(unsigned long owner)
{
@@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock)
* There is nothing that would stop spreading the lockdep annotations outwards
* except more code.
*/
+void mutex_init_generic(struct mutex *lock)
+{
+ __mutex_init_generic(lock);
+}
+EXPORT_SYMBOL(mutex_init_generic);
/*
* Optimistic trylock that only works in the uncontended case. Make sure to
@@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock)
return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL);
}
-#endif
+
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
+void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key)
+{
+ __mutex_init_generic(lock);
+
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_init_lockep);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag)
{
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 2e8080a9bee3..9ad4da8cea00 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock,
extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name,
- struct lock_class_key *key);
+extern void debug_mutex_init(struct mutex *lock);
#else /* CONFIG_DEBUG_MUTEXES */
# define debug_mutex_lock_common(lock, waiter) do { } while (0)
# define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
@@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0)
# define debug_mutex_unlock(lock) do { } while (0)
-# define debug_mutex_init(lock, name, key) do { } while (0)
+# define debug_mutex_init(lock) do { } while (0)
#endif /* !CONFIG_DEBUG_MUTEXES */
#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index bafd5af98eae..59dbd29cb219 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task)
#ifdef CONFIG_PREEMPT_RT
/* Mutexes */
-void __mutex_rt_init(struct mutex *mutex, const char *name,
- struct lock_class_key *key)
+static void __mutex_rt_init_generic(struct mutex *mutex)
{
+ rt_mutex_base_init(&mutex->rtmutex);
debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
- lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
}
-EXPORT_SYMBOL(__mutex_rt_init);
static __always_inline int __mutex_lock_common(struct mutex *lock,
unsigned int state,
@@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock,
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key)
+{
+ __mutex_rt_init_generic(mutex);
+ lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP);
+}
+EXPORT_SYMBOL(mutex_rt_init_lockdep);
+
void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
@@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock,
EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
#else /* CONFIG_DEBUG_LOCK_ALLOC */
+void mutex_rt_init_generic(struct mutex *mutex)
+{
+ __mutex_rt_init_generic(mutex);
+}
+EXPORT_SYMBOL(mutex_rt_init_generic);
+
void __sched mutex_lock(struct mutex *lock)
{
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 87b03d2e41db..2338b3adfb55 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock)
static inline void debug_write_lock_before(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
- RWLOCK_BUG_ON(lock->owner == current, lock, "recursion");
- RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(),
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion");
+ RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(),
lock, "cpu recursion");
}
diff --git a/kernel/nscommon.c b/kernel/nscommon.c
index c1fb2bad6d72..bdc3c86231d3 100644
--- a/kernel/nscommon.c
+++ b/kernel/nscommon.c
@@ -1,7 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
+#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
@@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
+ int ret = 0;
+
refcount_set(&ns->__ns_ref, 1);
ns->stashed = NULL;
ns->ops = ops;
ns->ns_id = 0;
ns->ns_type = ns_type;
- RB_CLEAR_NODE(&ns->ns_tree_node);
- INIT_LIST_HEAD(&ns->ns_list_node);
+ ns_tree_node_init(&ns->ns_tree_node);
+ ns_tree_node_init(&ns->ns_unified_node);
+ ns_tree_node_init(&ns->ns_owner_node);
+ ns_tree_root_init(&ns->ns_owner_root);
#ifdef CONFIG_DEBUG_VFS
ns_debug(ns, ops);
#endif
- if (inum) {
+ if (inum)
ns->inum = inum;
- return 0;
- }
- return proc_alloc_inum(&ns->inum);
+ else
+ ret = proc_alloc_inum(&ns->inum);
+ if (ret)
+ return ret;
+ /*
+ * Tree ref starts at 0. It's incremented when namespace enters
+ * active use (installed in nsproxy) and decremented when all
+ * active uses are gone. Initial namespaces are always active.
+ */
+ if (is_ns_init_inum(ns))
+ atomic_set(&ns->__ns_ref_active, 1);
+ else
+ atomic_set(&ns->__ns_ref_active, 0);
+ return 0;
}
void __ns_common_free(struct ns_common *ns)
{
proc_free_inum(ns->inum);
}
+
+struct ns_common *__must_check ns_owner(struct ns_common *ns)
+{
+ struct user_namespace *owner;
+
+ if (unlikely(!ns->ops))
+ return NULL;
+ VFS_WARN_ON_ONCE(!ns->ops->owner);
+ owner = ns->ops->owner(ns);
+ VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));
+ if (!owner)
+ return NULL;
+ /* Skip init_user_ns as it's always active */
+ if (owner == &init_user_ns)
+ return NULL;
+ return to_ns_common(owner);
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down.
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * The iteration stops once we reach a namespace that still has active
+ * references.
+ */
+void __ns_ref_active_put(struct ns_common *ns)
+{
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
+
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
+ VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
+ return;
+ }
+ }
+}
+
+/*
+ * The active reference count works by having each namespace that gets
+ * created take a single active reference on its owning user namespace.
+ * That single reference is only released once the child namespace's
+ * active count itself goes down. This makes it possible to efficiently
+ * resurrect a namespace tree:
+ *
+ * A regular namespace tree might look as follows:
+ * Legend:
+ * + : adding active reference
+ * - : dropping active reference
+ * x : always active (initial namespace)
+ *
+ *
+ * net_ns pid_ns
+ * \ /
+ * + +
+ * user_ns1 (2)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + + +
+ * user_ns2 (3)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If both net_ns and pid_ns put their last active reference on
+ * themselves it will cascade to user_ns1 dropping its own active
+ * reference and dropping one active reference on user_ns2:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * + - +
+ * user_ns2 (2)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Assume the whole tree is dead but all namespaces are still active:
+ *
+ * net_ns pid_ns
+ * \ /
+ * - -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - - -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
+ *
+ * net_ns pid_ns
+ * \ /
+ * + -
+ * user_ns1 (0)
+ * |
+ * ipc_ns | uts_ns
+ * \ | /
+ * - + -
+ * user_ns2 (0)
+ * |
+ * cgroup_ns | mnt_ns
+ * \ | /
+ * x x x
+ * init_user_ns (1)
+ *
+ * If net_ns had a zero reference count and we bumped it we also need to
+ * take another reference on its owning user namespace. Similarly, if
+ * pid_ns had a zero reference count it also needs to take another
+ * reference on its owning user namespace. So both net_ns and pid_ns
+ * will each have their own reference on the owning user namespace.
+ *
+ * If the owning user namespace user_ns1 had a zero reference count then
+ * it also needs to take another reference on its owning user namespace
+ * and so on.
+ */
+void __ns_ref_active_get(struct ns_common *ns)
+{
+ int prev;
+
+ /* Initial namespaces are always active. */
+ if (is_ns_init_id(ns))
+ return;
+
+ /* If we didn't resurrect the namespace we're done. */
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+
+ /*
+ * We did resurrect it. Walk the ownership hierarchy upwards
+ * until we find an owning user namespace that is active.
+ */
+ for (;;) {
+ ns = ns_owner(ns);
+ if (!ns)
+ return;
+
+ VFS_WARN_ON_ONCE(is_ns_init_id(ns));
+ prev = atomic_fetch_add(1, &ns->__ns_ref_active);
+ VFS_WARN_ON_ONCE(prev < 0);
+ if (likely(prev))
+ return;
+ }
+}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 19aa64ab08c8..259c4b4f1eeb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
+#include <linux/nstree.h>
static struct kmem_cache *nsproxy_cachep;
@@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void)
return nsproxy;
}
+static inline void nsproxy_free(struct nsproxy *ns)
+{
+ put_mnt_ns(ns->mnt_ns);
+ put_uts_ns(ns->uts_ns);
+ put_ipc_ns(ns->ipc_ns);
+ put_pid_ns(ns->pid_ns_for_children);
+ put_time_ns(ns->time_ns);
+ put_time_ns(ns->time_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
+ put_net(ns->net_ns);
+ kmem_cache_free(nsproxy_cachep, ns);
+}
+
+void deactivate_nsproxy(struct nsproxy *ns)
+{
+ nsproxy_ns_active_put(ns);
+ nsproxy_free(ns);
+}
+
/*
* Create new nsproxy and all of its the associated namespaces.
* Return the newly created nsproxy. Do not attach this to the task,
@@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk)
if ((flags & CLONE_VM) == 0)
timens_on_fork(new_ns, tsk);
+ nsproxy_ns_active_get(new_ns);
tsk->nsproxy = new_ns;
return 0;
}
-void free_nsproxy(struct nsproxy *ns)
-{
- put_mnt_ns(ns->mnt_ns);
- put_uts_ns(ns->uts_ns);
- put_ipc_ns(ns->ipc_ns);
- put_pid_ns(ns->pid_ns_for_children);
- put_time_ns(ns->time_ns);
- put_time_ns(ns->time_ns_for_children);
- put_cgroup_ns(ns->cgroup_ns);
- put_net(ns->net_ns);
- kmem_cache_free(nsproxy_cachep, ns);
-}
-
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
* On success, returns the new nsproxy.
@@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ if (new)
+ nsproxy_ns_active_get(new);
+
task_lock(p);
ns = p->nsproxy;
p->nsproxy = new;
@@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
put_nsproxy(ns);
}
-void exit_task_namespaces(struct task_struct *p)
+void exit_nsproxy_namespaces(struct task_struct *p)
{
switch_task_namespaces(p, NULL);
}
+void switch_cred_namespaces(const struct cred *old, const struct cred *new)
+{
+ ns_ref_active_get(new->user_ns);
+ ns_ref_active_put(old->user_ns);
+}
+
+void get_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_get(tsk->real_cred->user_ns);
+}
+
+void exit_cred_namespaces(struct task_struct *tsk)
+{
+ ns_ref_active_put(tsk->real_cred->user_ns);
+}
+
int exec_task_namespaces(void)
{
struct task_struct *tsk = current;
@@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset)
if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS))
free_fs_struct(nsset->fs);
if (nsset->nsproxy)
- free_nsproxy(nsset->nsproxy);
+ nsproxy_free(nsset->nsproxy);
}
static int prepare_nsset(unsigned flags, struct nsset *nsset)
diff --git a/kernel/nstree.c b/kernel/nstree.c
index b24a320a11a6..f36c59e6951d 100644
--- a/kernel/nstree.c
+++ b/kernel/nstree.c
@@ -1,140 +1,261 @@
// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/nstree.h>
#include <linux/proc_ns.h>
+#include <linux/rculist.h>
#include <linux/vfsdebug.h>
+#include <linux/syscalls.h>
+#include <linux/user_namespace.h>
-/**
- * struct ns_tree - Namespace tree
- * @ns_tree: Rbtree of namespaces of a particular type
- * @ns_list: Sequentially walkable list of all namespaces of this type
- * @ns_tree_lock: Seqlock to protect the tree and list
- * @type: type of namespaces in this tree
- */
-struct ns_tree {
- struct rb_root ns_tree;
- struct list_head ns_list;
- seqlock_t ns_tree_lock;
- int type;
+static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock);
+
+DEFINE_LOCK_GUARD_0(ns_tree_writer,
+ write_seqlock(&ns_tree_lock),
+ write_sequnlock(&ns_tree_lock))
+
+DEFINE_LOCK_GUARD_0(ns_tree_locked_reader,
+ read_seqlock_excl(&ns_tree_lock),
+ read_sequnlock_excl(&ns_tree_lock))
+
+static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head),
};
-struct ns_tree mnt_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNS,
+struct ns_tree_root mnt_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head),
};
-struct ns_tree net_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
- .type = CLONE_NEWNET,
+struct ns_tree_root net_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head),
};
EXPORT_SYMBOL_GPL(net_ns_tree);
-struct ns_tree uts_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUTS,
+struct ns_tree_root uts_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head),
};
-struct ns_tree user_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
- .type = CLONE_NEWUSER,
+struct ns_tree_root user_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head),
};
-struct ns_tree ipc_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
- .type = CLONE_NEWIPC,
+struct ns_tree_root ipc_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head),
};
-struct ns_tree pid_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
- .type = CLONE_NEWPID,
+struct ns_tree_root pid_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head),
};
-struct ns_tree cgroup_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
- .type = CLONE_NEWCGROUP,
+struct ns_tree_root cgroup_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head),
};
-struct ns_tree time_ns_tree = {
- .ns_tree = RB_ROOT,
- .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
- .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
- .type = CLONE_NEWTIME,
+struct ns_tree_root time_ns_tree = {
+ .ns_rb = RB_ROOT,
+ .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head),
};
-DEFINE_COOKIE(namespace_cookie);
+/**
+ * ns_tree_node_init - Prepare a namespace tree node for first use
+ * @node: The node to set up
+ *
+ * Leaves @node detached: the list entry is initialized as an empty list
+ * head and the rbtree node is marked as not linked into any tree.
+ */
+void ns_tree_node_init(struct ns_tree_node *node)
+{
+	INIT_LIST_HEAD(&node->ns_list_entry);
+	RB_CLEAR_NODE(&node->ns_node);
+}
+
+/**
+ * ns_tree_root_init - Reset a namespace tree root to the empty state
+ * @root: The root to set up
+ *
+ * After this call the root holds no namespaces: the list head is empty
+ * and the rbtree root is RB_ROOT.
+ */
+void ns_tree_root_init(struct ns_tree_root *root)
+{
+	INIT_LIST_HEAD(&root->ns_list_head);
+	root->ns_rb = RB_ROOT;
+}
+
+/**
+ * ns_tree_node_empty - Test whether a namespace tree node is unlinked
+ * @node: The node to inspect
+ *
+ * Only the rbtree linkage is consulted; the list entry is not looked at.
+ *
+ * Returns true if the rbtree node of @node is marked empty.
+ */
+bool ns_tree_node_empty(const struct ns_tree_node *node)
+{
+	const struct rb_node *rb = &node->ns_node;
+
+	return RB_EMPTY_NODE(rb);
+}
+
+/**
+ * ns_tree_node_add - Add a node to a namespace tree
+ * @node: The node to add
+ * @root: The tree root to add to
+ * @cmp: Comparison function for rbtree insertion
+ *
+ * Adds the node to both the rbtree and the list, maintaining sorted order.
+ * The list is maintained in the same order as the rbtree to enable efficient
+ * iteration.
+ *
+ * If @cmp reports that an equal node is already present, @node is linked
+ * into neither the rbtree nor the list, so both structures keep describing
+ * the same set of namespaces.
+ *
+ * Returns: NULL if insertion succeeded, existing node if duplicate found
+ */
+struct rb_node *ns_tree_node_add(struct ns_tree_node *node,
+				 struct ns_tree_root *root,
+				 int (*cmp)(struct rb_node *, const struct rb_node *))
+{
+	struct rb_node *ret, *prev;
+
+	/* Add to rbtree */
+	ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp);
+	if (ret) {
+		/*
+		 * Duplicate: the rbtree was left unchanged, so do not put
+		 * @node on the list either.
+		 */
+		return ret;
+	}
+
+	/* Add to list in sorted order */
+	prev = rb_prev(&node->ns_node);
+	if (!prev) {
+		/* No previous node, add at head */
+		list_add_rcu(&node->ns_list_entry, &root->ns_list_head);
+	} else {
+		/* Add after previous node */
+		struct ns_tree_node *prev_node;
+
+		prev_node = rb_entry(prev, struct ns_tree_node, ns_node);
+		list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry);
+	}
+
+	return NULL;
+}
+
+/**
+ * ns_tree_node_del - Remove a node from a namespace tree
+ * @node: The node to remove
+ * @root: The tree root to remove from
+ *
+ * Erases the node from the rbtree, marks its rbtree node as empty and
+ * unlinks its list entry. No lock is taken here, so the three steps only
+ * appear as a single update when the caller serializes them.
+ * NOTE(review): __ns_tree_remove() calls this with ns_tree_lock held for
+ * writing; confirm that any other caller does the same.
+ */
+void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root)
+{
+ rb_erase(&node->ns_node, &root->ns_rb);
+ RB_CLEAR_NODE(&node->ns_node);
+ list_bidir_del_rcu(&node->ns_list_entry);
+}
static inline struct ns_common *node_to_ns(const struct rb_node *node)
{
if (!node)
return NULL;
- return rb_entry(node, struct ns_common, ns_tree_node);
+ return rb_entry(node, struct ns_common, ns_tree_node.ns_node);
}
-static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+static inline struct ns_common *node_to_ns_unified(const struct rb_node *node)
{
- struct ns_common *ns_a = node_to_ns(a);
- struct ns_common *ns_b = node_to_ns(b);
- u64 ns_id_a = ns_a->ns_id;
- u64 ns_id_b = ns_b->ns_id;
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_unified_node.ns_node);
+}
- if (ns_id_a < ns_id_b)
+/* Map an rbtree node of an owner tree back to its namespace; NULL stays NULL. */
+static inline struct ns_common *node_to_ns_owner(const struct rb_node *node)
+{
+	return node ? rb_entry(node, struct ns_common, ns_owner_node.ns_node) : NULL;
+}
+
+static int ns_id_cmp(u64 id_a, u64 id_b)
+{
+ if (id_a < id_b)
return -1;
- if (ns_id_a > ns_id_b)
+ if (id_a > id_b)
return 1;
return 0;
}
-void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+/* rbtree comparator for the per-type trees: orders namespaces by ns_id. */
+static int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+	const struct ns_common *lhs = node_to_ns(a);
+	const struct ns_common *rhs = node_to_ns(b);
+
+	return ns_id_cmp(lhs->ns_id, rhs->ns_id);
+}
+
+/* rbtree comparator for the unified tree: orders namespaces by ns_id. */
+static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b)
+{
+	const struct ns_common *lhs = node_to_ns_unified(a);
+	const struct ns_common *rhs = node_to_ns_unified(b);
+
+	return ns_id_cmp(lhs->ns_id, rhs->ns_id);
+}
+
+static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b)
{
- struct rb_node *node, *prev;
+ return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id);
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree)
+{
+ struct rb_node *node;
+ const struct proc_ns_operations *ops = ns->ops;
VFS_WARN_ON_ONCE(!ns->ns_id);
- write_seqlock(&ns_tree->ns_tree_lock);
+ guard(ns_tree_writer)();
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ /* Add to per-type tree and list */
+ node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp);
- node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
- /*
- * If there's no previous entry simply add it after the
- * head and if there is add it after the previous entry.
- */
- prev = rb_prev(&ns->ns_tree_node);
- if (!prev)
- list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
- else
- list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+ /* Add to unified tree and list */
+ ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified);
+
+ /* Add to owner's tree if applicable */
+ if (ops) {
+ struct user_namespace *user_ns;
- write_sequnlock(&ns_tree->ns_tree_lock);
+ VFS_WARN_ON_ONCE(!ops->owner);
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+ /* Insert into owner's tree and list */
+ ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner);
+ } else {
+ /* Only the initial user namespace doesn't have an owner. */
+ VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns));
+ }
+ }
VFS_WARN_ON_ONCE(node);
}
-void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree)
{
- VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
- VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
- VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type);
+ const struct proc_ns_operations *ops = ns->ops;
+ struct user_namespace *user_ns;
+
+ VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry));
+
+ write_seqlock(&ns_tree_lock);
+
+ /* Remove from per-type tree and list */
+ ns_tree_node_del(&ns->ns_tree_node, ns_tree);
+
+ /* Remove from unified tree and list */
+ ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root);
- write_seqlock(&ns_tree->ns_tree_lock);
- rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
- list_bidir_del_rcu(&ns->ns_list_node);
- RB_CLEAR_NODE(&ns->ns_tree_node);
- write_sequnlock(&ns_tree->ns_tree_lock);
+ /* Remove from owner's tree if applicable */
+ if (ops) {
+ user_ns = ops->owner(ns);
+ if (user_ns) {
+ struct ns_common *owner = &user_ns->ns;
+ ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root);
+ }
+ }
+
+ write_sequnlock(&ns_tree_lock);
}
EXPORT_SYMBOL_GPL(__ns_tree_remove);
@@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node)
return 0;
}
+static int ns_find_unified(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns_unified(node);
-static struct ns_tree *ns_tree_from_type(int ns_type)
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+static struct ns_tree_root *ns_tree_from_type(int ns_type)
{
switch (ns_type) {
case CLONE_NEWCGROUP:
@@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type)
return NULL;
}
-struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id)
{
- struct ns_tree *ns_tree;
struct rb_node *node;
unsigned int seq;
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+ do {
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree_lock, seq));
+
+ return node_to_ns_unified(node);
+}
+
+static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree_root *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
ns_tree = ns_tree_from_type(ns_type);
if (!ns_tree)
return NULL;
do {
- seq = read_seqbegin(&ns_tree->ns_tree_lock);
- node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ seq = read_seqbegin(&ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find);
if (node)
break;
- } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+ } while (read_seqretry(&ns_tree_lock, seq));
- if (!node)
- return NULL;
+ return node_to_ns(node);
+}
- VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type);
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
- return node_to_ns(node);
+ if (ns_type)
+ return __ns_tree_lookup_rcu(ns_id, ns_type);
+
+ return __ns_unified_tree_lookup_rcu(ns_id);
}
/**
- * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * __ns_tree_adjoined_rcu - find the next/previous namespace in the same
* tree
* @ns: namespace to start from
+ * @ns_tree: namespace tree to search in
* @previous: if true find the previous namespace, otherwise the next
*
* Find the next or previous namespace in the same tree as @ns. If
* there is no next/previous namespace, -ENOENT is returned.
*/
struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
- struct ns_tree *ns_tree, bool previous)
+ struct ns_tree_root *ns_tree, bool previous)
{
struct list_head *list;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
if (previous)
- list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry));
else
- list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
- if (list_is_head(list, &ns_tree->ns_list))
+ list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry));
+ if (list_is_head(list, &ns_tree->ns_list_head))
return ERR_PTR(-ENOENT);
- VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type);
-
- return list_entry_rcu(list, struct ns_common, ns_list_node);
+ return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry);
}
/**
- * ns_tree_gen_id - generate a new namespace id
+ * __ns_tree_gen_id - generate a new namespace id
* @ns: namespace to generate id for
+ * @id: if non-zero, this is the initial namespace and this is a fixed id
*
* Generates a new namespace id and assigns it to the namespace. All
* namespaces types share the same id space and thus can be compared
* directly. IOW, when two ids of two namespace are equal, they are
* identical.
*/
-u64 ns_tree_gen_id(struct ns_common *ns)
+u64 __ns_tree_gen_id(struct ns_common *ns, u64 id)
{
- guard(preempt)();
- ns->ns_id = gen_cookie_next(&namespace_cookie);
+ static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1);
+
+ if (id)
+ ns->ns_id = id;
+ else
+ ns->ns_id = atomic64_inc_return(&namespace_cookie);
return ns->ns_id;
}
+
+struct klistns {
+ u64 __user *uns_ids; /* user buffer that receives the namespace ids */
+ u32 nr_ns_ids; /* number of ids the user buffer can hold */
+ u64 last_ns_id; /* request's ns_id; when non-zero, listing resumes at last_ns_id + 1 */
+ u64 user_ns_id; /* request's user_ns_id; non-zero selects listing by owning user namespace */
+ u32 ns_type; /* request's ns_type mask; zero matches every type */
+ struct user_namespace *user_ns; /* owning user namespace, set by do_listns_userns() */
+ bool userns_capable; /* cached ns_capable_noaudit(user_ns, CAP_SYS_ADMIN) result */
+ struct ns_common *first_ns; /* referenced resume point; put by the klistns cleanup helper */
+};
+
+/*
+ * Cleanup helper for a struct klistns: releases the user namespace unless
+ * the request named LISTNS_CURRENT_USER, and puts the resume-point
+ * namespace if one was recorded and it has ops.
+ */
+static void __free_klistns_free(const struct klistns *kls)
+{
+	struct ns_common *first = kls->first_ns;
+
+	if (kls->user_ns_id != LISTNS_CURRENT_USER)
+		put_user_ns(kls->user_ns);
+
+	if (first && first->ops)
+		first->ops->put(first);
+}
+
+#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS)
+
+/*
+ * Copy a struct ns_id_req from userspace into @kreq and validate it.
+ *
+ * Returns 0 on success, -EFAULT if the size field cannot be read, -E2BIG
+ * if the advertised size exceeds PAGE_SIZE, -EINVAL if it is smaller than
+ * NS_ID_REQ_SIZE_VER0 or the spare field is set, -EOPNOTSUPP if ns_type
+ * has bits outside NS_ALL, or the error from copy_struct_from_user().
+ */
+static int copy_ns_id_req(const struct ns_id_req __user *req,
+			  struct ns_id_req *kreq)
+{
+	size_t usize;
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0);
+
+	if (get_user(usize, &req->size))
+		return -EFAULT;
+	if (unlikely(usize > PAGE_SIZE))
+		return -E2BIG;
+	if (unlikely(usize < NS_ID_REQ_SIZE_VER0))
+		return -EINVAL;
+
+	memset(kreq, 0, sizeof(*kreq));
+	err = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+	if (err)
+		return err;
+
+	if (kreq->spare)
+		return -EINVAL;
+
+	return (kreq->ns_type & ~NS_ALL) ? -EOPNOTSUPP : 0;
+}
+
+/*
+ * Seed @kls from the validated request and the caller's output buffer.
+ * Always returns 0.
+ */
+static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq,
+				  u64 __user *ns_ids, size_t nr_ns_ids)
+{
+	/* Where the results go and how many fit. */
+	kls->uns_ids = ns_ids;
+	kls->nr_ns_ids = nr_ns_ids;
+
+	/* Filters and resume point taken from the request. */
+	kls->ns_type = kreq->ns_type;
+	kls->user_ns_id = kreq->user_ns_id;
+	kls->last_ns_id = kreq->ns_id;
+
+	return 0;
+}
+
+/*
+ * Find the namespace owned by @owner with the smallest id that is >= @ns_id.
+ * The owner's rbtree is walked with ns_tree_lock held as an exclusive reader.
+ * Returns the result of ns_get_unless_inactive() on the match, or NULL if
+ * no such namespace exists.
+ */
+static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner)
+{
+	struct ns_common *best = NULL;
+	struct rb_node *node;
+
+	VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER);
+
+	guard(ns_tree_locked_reader)();
+
+	node = owner->ns_owner_root.ns_rb.rb_node;
+	while (node) {
+		struct ns_common *ns = node_to_ns_owner(node);
+
+		if (ns_id > ns->ns_id) {
+			/* Everything here and to the left is too small. */
+			node = node->rb_right;
+			continue;
+		}
+
+		/* Candidate; an exact hit cannot be improved upon. */
+		best = ns;
+		if (ns_id == ns->ns_id)
+			break;
+		node = node->rb_left;
+	}
+
+	if (best)
+		best = ns_get_unless_inactive(best);
+	return best;
+}
+
+/*
+ * Look up the namespace with id @ns_id (of @ns_type, or of any type when
+ * @ns_type is zero) under RCU. Returns NULL if it is not found or if
+ * ns_get_unless_inactive() refuses it.
+ */
+static struct ns_common *lookup_ns_id(u64 ns_id, int ns_type)
+{
+	struct ns_common *ns;
+
+	guard(rcu)();
+
+	ns = ns_tree_lookup_rcu(ns_id, ns_type);
+	if (!ns || !ns_get_unless_inactive(ns))
+		return NULL;
+
+	return ns;
+}
+
+/* True if @ns matches the request's type mask; an empty mask matches all. */
+static inline bool __must_check ns_requested(const struct klistns *kls,
+					     const struct ns_common *ns)
+{
+	if (!kls->ns_type)
+		return true;
+
+	return (kls->ns_type & ns->ns_type) != 0;
+}
+
+static inline bool __must_check may_list_ns(const struct klistns *kls, /* decide whether the caller may see @ns in the listing */
+ struct ns_common *ns)
+{
+ if (kls->user_ns) { /* listing by owning user namespace: reuse the cached capability result */
+ if (kls->userns_capable)
+ return true;
+ } else { /* global listing: check CAP_SYS_ADMIN in the user namespace that owns @ns */
+ struct ns_common *owner;
+ struct user_namespace *user_ns;
+
+ owner = ns_owner(ns);
+ if (owner)
+ user_ns = to_user_ns(owner);
+ else
+ user_ns = &init_user_ns; /* no owner reported: fall back to init_user_ns */
+ if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN))
+ return true;
+ }
+
+ if (is_current_namespace(ns)) /* is_current_namespace() hit: always listed */
+ return true;
+
+ if (ns->ns_type != CLONE_NEWUSER) /* the remaining rule only applies to user namespaces */
+ return false;
+
+ if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) /* caller has CAP_SYS_ADMIN in @ns itself */
+ return true;
+
+ return false;
+}
+
+/* Drop a namespace reference through its ops; NULL or ops-less is a no-op. */
+static inline void ns_put(struct ns_common *ns)
+{
+	if (!ns || !ns->ops)
+		return;
+
+	ns->ops->put(ns);
+}
+
+DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))
+
+static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, /* return a referenced @candidate if it may be reported, else NULL */
+ struct ns_common *candidate)
+{
+ struct ns_common *ns __free(ns_put) = NULL; /* auto-put on every early return below */
+
+ if (!ns_requested(kls, candidate)) /* type filter first, before taking any reference */
+ return NULL;
+
+ ns = ns_get_unless_inactive(candidate);
+ if (!ns)
+ return NULL;
+
+ if (!may_list_ns(kls, ns)) /* permission check; the reference is dropped by __free on failure */
+ return NULL;
+
+ return no_free_ptr(ns); /* hand the reference to the caller, who must ns_put() it */
+}
+
+static ssize_t do_listns_userns(struct klistns *kls) /* list ids of namespaces owned by one user namespace; returns the count written or a negative errno */
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
+ const struct list_head *head;
+ ssize_t ret;
+
+ VFS_WARN_ON_ONCE(!kls->user_ns_id);
+
+ if (kls->user_ns_id == LISTNS_CURRENT_USER)
+ ns = to_ns_common(current_user_ns()); /* no reference taken here; the cleanup helper skips the put for this case */
+ else if (kls->user_ns_id)
+ ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER);
+ if (!ns)
+ return -EINVAL;
+ kls->user_ns = to_user_ns(ns);
+
+ /*
+ * Use the rbtree to find the first namespace we care about and
+ * then use its list entry to iterate from there.
+ */
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); /* stored in @kls so the cleanup helper puts it */
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head;
+ kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); /* evaluated once, consumed by may_list_ns() */
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry);
+
+ ns = first_ns;
+ list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) {
+ struct ns_common *valid;
+
+ if (!nr_ns_ids) /* user buffer is full */
+ break;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock(); /* leave the RCU read section before put_user() */
+
+ ns_put(prev);
+ prev = valid; /* keep a reference on the entry just reported while outside RCU */
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+/*
+ * Find the namespace with the smallest id that is >= @ns_id. A non-zero
+ * @ns_type searches that type's tree, zero searches the unified tree.
+ * Returns the result of ns_get_unless_inactive() on the match, or NULL if
+ * @ns_type has no tree or no such namespace exists.
+ */
+static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type)
+{
+	struct ns_tree_root *ns_tree = NULL;
+	struct ns_common *best = NULL;
+	struct rb_node *node;
+
+	if (ns_type) {
+		ns_tree = ns_tree_from_type(ns_type);
+		if (!ns_tree)
+			return NULL;
+	}
+
+	guard(ns_tree_locked_reader)();
+
+	node = ns_tree ? ns_tree->ns_rb.rb_node : ns_unified_root.ns_rb.rb_node;
+	while (node) {
+		struct ns_common *ns;
+
+		ns = ns_type ? node_to_ns(node) : node_to_ns_unified(node);
+		if (ns_id > ns->ns_id) {
+			/* Everything here and to the left is too small. */
+			node = node->rb_right;
+			continue;
+		}
+
+		/* Candidate; an exact hit cannot be improved upon. */
+		best = ns;
+		if (ns_id == ns->ns_id)
+			break;
+		node = node->rb_left;
+	}
+
+	if (best)
+		best = ns_get_unless_inactive(best);
+	return best;
+}
+
+/*
+ * First namespace on the list at @head. Without @ns_tree the list is the
+ * unified one, otherwise it is a per-type list.
+ */
+static inline struct ns_common *first_ns_common(const struct list_head *head,
+						struct ns_tree_root *ns_tree)
+{
+	if (!ns_tree)
+		return list_entry_rcu(head->next, struct ns_common,
+				      ns_unified_node.ns_list_entry);
+
+	return list_entry_rcu(head->next, struct ns_common,
+			      ns_tree_node.ns_list_entry);
+}
+
+/*
+ * Successor of @ns on the unified list (no @ns_tree) or on its per-type
+ * list (@ns_tree set).
+ */
+static inline struct ns_common *next_ns_common(struct ns_common *ns,
+					       struct ns_tree_root *ns_tree)
+{
+	if (!ns_tree)
+		return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next,
+				      struct ns_common,
+				      ns_unified_node.ns_list_entry);
+
+	return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next,
+			      struct ns_common, ns_tree_node.ns_list_entry);
+}
+
+/*
+ * True once iteration has wrapped around to the list head, i.e. when the
+ * relevant list entry of @ns is @head itself.
+ */
+static inline bool ns_common_is_head(struct ns_common *ns,
+				     const struct list_head *head,
+				     struct ns_tree_root *ns_tree)
+{
+	const struct list_head *entry;
+
+	entry = ns_tree ? &ns->ns_tree_node.ns_list_entry
+			: &ns->ns_unified_node.ns_list_entry;
+	return entry == head;
+}
+
+static ssize_t do_listns(struct klistns *kls) /* list namespace ids from a per-type or the unified list; returns the count written or a negative errno */
+{
+ u64 __user *ns_ids = kls->uns_ids;
+ size_t nr_ns_ids = kls->nr_ns_ids;
+ struct ns_common *ns, *first_ns = NULL, *prev = NULL;
+ struct ns_tree_root *ns_tree = NULL;
+ const struct list_head *head;
+ u32 ns_type;
+ ssize_t ret;
+
+ if (hweight32(kls->ns_type) == 1) /* exactly one type requested: walk that type's own list */
+ ns_type = kls->ns_type;
+ else
+ ns_type = 0; /* none or several types: walk the unified list and filter per entry */
+
+ if (ns_type) {
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return -EINVAL;
+ }
+
+ if (kls->last_ns_id) {
+ kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); /* stored in @kls so the cleanup helper puts it */
+ if (!kls->first_ns)
+ return -ENOENT;
+ first_ns = kls->first_ns;
+ }
+
+ ret = 0;
+ if (ns_tree)
+ head = &ns_tree->ns_list_head;
+ else
+ head = &ns_unified_root.ns_list_head;
+
+ rcu_read_lock();
+
+ if (!first_ns)
+ first_ns = first_ns_common(head, ns_tree);
+
+ for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; /* stop at end of list or when the user buffer is full */
+ ns = next_ns_common(ns, ns_tree)) {
+ struct ns_common *valid;
+
+ valid = legitimize_ns(kls, ns);
+ if (!valid)
+ continue;
+
+ rcu_read_unlock(); /* leave the RCU read section before put_user() */
+
+ ns_put(prev);
+ prev = valid; /* keep a reference on the entry just reported while outside RCU */
+
+ if (put_user(valid->ns_id, ns_ids + ret)) {
+ ns_put(prev);
+ return -EFAULT;
+ }
+
+ nr_ns_ids--;
+ ret++;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+ ns_put(prev);
+ return ret;
+}
+
+SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, /* listns(2): write up to @nr_ns_ids namespace ids selected by @req into @ns_ids */
+ u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags)
+{
+ struct klistns klns __free(klistns_free) = {}; /* cleanup runs on every return path */
+ const size_t maxcount = 1000000; /* upper bound on ids accepted per call */
+ struct ns_id_req kreq;
+ ssize_t ret;
+
+ if (flags) /* no flags are accepted */
+ return -EINVAL;
+
+ if (unlikely(nr_ns_ids > maxcount))
+ return -EOVERFLOW;
+
+ if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids)))
+ return -EFAULT;
+
+ ret = copy_ns_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+ ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids);
+ if (ret)
+ return ret;
+
+ if (kreq.user_ns_id) /* non-zero user_ns_id: list what that user namespace owns */
+ return do_listns_userns(&klns);
+
+ return do_listns(&klns); /* otherwise list from the per-type or unified list */
+}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fffec767a63..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .ns.__ns_ref = REFCOUNT_INIT(2),
+ .ns = NS_COMMON_INIT(init_pid_ns),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_pid_ns),
-#ifdef CONFIG_PID_NS
- .ns.ops = &pidns_operations,
-#endif
.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
- .ns.ns_type = ns_common_type(&init_pid_ns),
};
EXPORT_SYMBOL_GPL(init_pid_ns);
@@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
int i;
+ struct pid_namespace *active_ns;
lockdep_assert_not_held(&tasklist_lock);
+ active_ns = pid->numbers[pid->level].ns;
+ ns_ref_active_put(active_ns);
+
spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
@@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
spin_unlock(&pidmap_lock);
idr_preload_end();
+ ns_ref_active_get(ns);
return pid;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 650be58d8d18..e48f5de41361 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags,
void put_pid_ns(struct pid_namespace *ns)
{
- if (ns && ns != &init_pid_ns && ns_ref_put(ns))
+ if (ns && ns_ref_put(ns))
schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 53166ef86ba4..26e45f86b955 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -821,8 +821,7 @@ int hibernate(void)
goto Restore;
ksys_sync_helper();
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -928,8 +927,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
error = freeze_processes();
if (error)
@@ -1079,8 +1077,7 @@ static int software_resume(void)
if (error)
goto Restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
pm_pr_dbg("Preparing processes for hibernation restore.\n");
error = freeze_processes();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b4ca17c2fecf..3d4ebedad69f 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Restore;
- if (filesystem_freeze_enabled)
- filesystems_freeze();
+ filesystems_freeze(filesystem_freeze_enabled);
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
trace_suspend_resume(TPS("freeze_processes"), 0, false);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 0beff7eeaaba..70ae21f7370d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -635,7 +635,7 @@ struct cmp_data {
};
/* Indicates the image size after compression */
-static atomic_t compressed_size = ATOMIC_INIT(0);
+static atomic64_t compressed_size = ATOMIC64_INIT(0);
/*
* Compression function that runs in its own thread.
@@ -664,7 +664,7 @@ static int compress_threadfn(void *data)
d->ret = crypto_acomp_compress(d->cr);
d->cmp_len = d->cr->dlen;
- atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len);
+ atomic64_add(d->cmp_len, &compressed_size);
atomic_set_release(&d->stop, 1);
wake_up(&d->done);
}
@@ -689,14 +689,14 @@ static int save_compressed_image(struct swap_map_handle *handle,
ktime_t start;
ktime_t stop;
size_t off;
- unsigned thr, run_threads, nr_threads;
+ unsigned int thr, run_threads, nr_threads;
unsigned char *page = NULL;
struct cmp_data *data = NULL;
struct crc_data *crc = NULL;
hib_init_batch(&hb);
- atomic_set(&compressed_size, 0);
+ atomic64_set(&compressed_size, 0);
/*
* We'll limit the number of threads for compression to limit memory
@@ -877,11 +877,14 @@ out_finish:
stop = ktime_get();
if (!ret)
ret = err2;
- if (!ret)
+ if (!ret) {
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
+ pr_info("Image size after compression: %lld kbytes\n",
+ (atomic64_read(&compressed_size) / 1024));
pr_info("Image saving done\n");
- swsusp_show_speed(start, stop, nr_to_write, "Wrote");
- pr_info("Image size after compression: %d kbytes\n",
- (atomic_read(&compressed_size) / 1024));
+ } else {
+ pr_err("Image saving failed: %d\n", ret);
+ }
out_clean:
hib_finish_batch(&hb);
@@ -899,7 +902,8 @@ out_clean:
}
vfree(data);
}
- if (page) free_page((unsigned long)page);
+ if (page)
+ free_page((unsigned long)page);
return ret;
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7097de2c8cda..4f97896887ec 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
- u64 utime, stime;
struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
+ u64 utime, stime;
/*
* Update current task runtime to account pending time since last
@@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
if (same_thread_group(current, tsk))
(void) task_sched_runtime(current);
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+ guard(rcu)();
+ scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
times->utime = sig->utime;
times->stime = sig->stime;
times->sum_exec_runtime = sig->sum_sched_runtime;
- for_each_thread(tsk, t) {
+ __for_each_thread(sig, t) {
task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += read_sum_exec_runtime(t);
}
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
+ }
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ecb251e883ea..979484dab2d3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root;
* guarantee system safety. Maintain a dedicated task list which contains every
* task between its fork and eventual free.
*/
-static DEFINE_SPINLOCK(scx_tasks_lock);
+static DEFINE_RAW_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
@@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
@@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter)
__scx_task_iter_rq_unlock(iter);
if (iter->list_locked) {
iter->list_locked = false;
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
}
}
static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
if (!iter->list_locked) {
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
iter->list_locked = true;
}
}
@@ -2940,9 +2940,9 @@ void scx_post_fork(struct task_struct *p)
}
}
- spin_lock_irq(&scx_tasks_lock);
+ raw_spin_lock_irq(&scx_tasks_lock);
list_add_tail(&p->scx.tasks_node, &scx_tasks);
- spin_unlock_irq(&scx_tasks_lock);
+ raw_spin_unlock_irq(&scx_tasks_lock);
percpu_up_read(&scx_fork_rwsem);
}
@@ -2966,9 +2966,9 @@ void sched_ext_free(struct task_struct *p)
{
unsigned long flags;
- spin_lock_irqsave(&scx_tasks_lock, flags);
+ raw_spin_lock_irqsave(&scx_tasks_lock, flags);
list_del_init(&p->scx.tasks_node);
- spin_unlock_irqrestore(&scx_tasks_lock, flags);
+ raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
* @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
@@ -4276,7 +4276,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
size_t avail, used;
bool idle;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
@@ -4345,7 +4345,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
scx_dump_task(&s, &dctx, p, ' ');
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
}
dump_newline(&s);
@@ -4479,8 +4479,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
- if (!sch->helper)
+ if (IS_ERR(sch->helper)) {
+ ret = PTR_ERR(sch->helper);
goto err_free_pcpu;
+ }
+
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5321,8 +5324,8 @@ void __init init_sched_ext_class(void)
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
- init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
- init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
+ rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
+ rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
if (cpu_online(cpu))
cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;
@@ -6401,7 +6404,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
guard(rcu)();
- sch = rcu_dereference(sch);
+ sch = rcu_dereference(scx_root);
if (unlikely(!sch))
return;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 5b6997f4dc3d..e76be24b132c 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.ns_type = ns_common_type(&init_time_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = ns_init_inum(&init_time_ns),
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index aa3120104a51..56e17b625c72 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (!kc->timer_create)
return -EOPNOTSUPP;
- new_timer = alloc_posix_timer();
- if (unlikely(!new_timer))
- return -EAGAIN;
-
- spin_lock_init(&new_timer->it_lock);
-
/* Special case for CRIU to restore timers with a given timer ID. */
if (unlikely(current->signal->timer_create_restore_ids)) {
if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
@@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
return -EINVAL;
}
+ new_timer = alloc_posix_timer();
+ if (unlikely(!new_timer))
+ return -EAGAIN;
+
+ spin_lock_init(&new_timer->it_lock);
+
/*
* Add the timer to the hash table. The timer is not yet valid
* after insertion, but has a unique ID allocated.
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c527b421c865..466e083c8272 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1152,16 +1152,15 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit >= 10)
- return false;
-
/* On RT, softirq handling may be waiting on some lock */
if (local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3a4d3b2e3f74..4790da895203 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = {
static int __init tk_aux_sysfs_init(void)
{
struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+ int ret = -ENOMEM;
if (!tko)
- return -ENOMEM;
+ return ret;
auxo = kobject_create_and_add("aux_clocks", tko);
- if (!auxo) {
- kobject_put(tko);
- return -ENOMEM;
- }
+ if (!auxo)
+ goto err_clean;
for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
char id[2] = { [0] = '0' + i, };
struct kobject *clk = kobject_create_and_add(id, auxo);
- if (!clk)
- return -ENOMEM;
-
- int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+ if (!clk) {
+ ret = -ENOMEM;
+ goto err_clean;
+ }
+ ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
if (ret)
- return ret;
+ goto err_clean;
}
return 0;
+
+err_clean:
+ kobject_put(auxo);
+ kobject_put(tko);
+ return ret;
}
late_initcall(tk_aux_sysfs_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 553fa469d7cc..d5ebb1d927ea 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer)
+ if (base->running_timer != timer) {
ret = detach_if_pending(timer, base, true);
- if (shutdown)
- timer->function = NULL;
+ if (shutdown)
+ timer->function = NULL;
+ }
raw_spin_unlock_irqrestore(&base->lock, flags);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 42bd2ba68a82..59cfacb8a5bb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops)
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
- struct ftrace_hash *new_hash)
+ struct ftrace_hash *new_hash,
+ bool update_target)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
@@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (rec->flags & FTRACE_FL_DISABLED)
continue;
- /* We need to update only differences of filter_hash */
+ /*
+ * Unless we are updating the target of a direct function,
+ * we only need to update differences of filter_hash
+ */
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
- if (in_old == in_new)
+ if (!update_target && (in_old == in_new))
continue;
if (in_new) {
@@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
if (is_ipmodify)
goto rollback;
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+ /*
+ * If this is called by __modify_ftrace_direct()
+ * then it is only changing where the direct
+ * pointer is jumping to, and the record already
+ * points to a direct trampoline. If it isn't,
+ * then it is a bug to update ipmodify on a direct
+ * caller.
+ */
+ FTRACE_WARN_ON(!update_target &&
+ (rec->flags & FTRACE_FL_DIRECT));
/*
* Another ops with IPMODIFY is already
@@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false);
}
/* Disabling always succeeds */
@@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
if (ftrace_hash_empty(hash))
hash = NULL;
- __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false);
}
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
@@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
if (ftrace_hash_empty(new_hash))
new_hash = NULL;
- return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false);
}
static void print_ip_ins(const char *fmt, const unsigned char *p)
@@ -5953,6 +5966,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp)
free_ftrace_hash(fhp);
}
+static void reset_direct(struct ftrace_ops *ops, unsigned long addr)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ remove_direct_functions_hash(hash, addr);
+
+ /* cleanup for possible another register call */
+ ops->func = NULL;
+ ops->trampoline = 0;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -6048,6 +6072,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
+ if (err)
+ reset_direct(ops, addr);
out_unlock:
mutex_unlock(&direct_mutex);
@@ -6080,7 +6106,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct);
int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
bool free_filters)
{
- struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
if (check_direct_multi(ops))
@@ -6090,13 +6115,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
mutex_lock(&direct_mutex);
err = unregister_ftrace_function(ops);
- remove_direct_functions_hash(hash, addr);
+ reset_direct(ops, addr);
mutex_unlock(&direct_mutex);
- /* cleanup for possible another register call */
- ops->func = NULL;
- ops->trampoline = 0;
-
if (free_filters)
ftrace_free_filter(ops);
return err;
@@ -6106,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash;
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
struct ftrace_func_entry *entry, *iter;
static struct ftrace_ops tmp_ops = {
.func = ftrace_stub,
@@ -6127,12 +6148,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
return err;
/*
+ * Call __ftrace_hash_update_ipmodify() here, so that we can call
+ * ops->ops_func for the ops. This is needed because the above
+ * register_ftrace_function_nolock() worked on tmp_ops.
+ */
+ err = __ftrace_hash_update_ipmodify(ops, hash, hash, true);
+ if (err)
+ goto out;
+
+ /*
* Now the ftrace_ops_list_func() is called to do the direct callers.
* We can safely change the direct functions attached to each entry.
*/
mutex_lock(&ftrace_lock);
- hash = ops->func_hash->filter_hash;
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(iter, &hash->buckets[i], hlist) {
@@ -6147,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
mutex_unlock(&ftrace_lock);
+out:
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d1e527cf2aae..304e93597126 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
put_snapshot_map(iter->tr);
}
+static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Trace buffer mappings require the complete buffer including
+ * the meta page. Partial mappings are not supported.
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct tracing_buffers_vmops = {
.close = tracing_buffers_mmap_close,
+ .may_split = tracing_buffers_may_split,
};
static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index c428dafe7496..b15854c75d4f 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = {
static int user_event_set_call_visible(struct user_event *user, bool visible)
{
- int ret;
- const struct cred *old_cred;
- struct cred *cred;
-
- cred = prepare_creds();
-
+ CLASS(prepare_creds, cred)();
if (!cred)
return -ENOMEM;
@@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible)
*/
cred->fsuid = GLOBAL_ROOT_UID;
- old_cred = override_creds(cred);
-
- if (visible)
- ret = trace_add_event_call(&user->call);
- else
- ret = trace_remove_event_call(&user->call);
-
- revert_creds(old_cred);
- put_cred(cred);
+ scoped_with_creds(cred) {
+ if (visible)
+ return trace_add_event_call(&user->call);
- return ret;
+ return trace_remove_event_call(&user->call);
+ }
}
static int destroy_user_event(struct user_event *user)
diff --git a/kernel/user.c b/kernel/user.c
index 0163665914c9..7aef4e679a6a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
* and 1 for... ?
*/
struct user_namespace init_user_ns = {
+ .ns = NS_COMMON_INIT(init_user_ns),
.uid_map = {
{
.extent[0] = {
@@ -65,14 +66,8 @@ struct user_namespace init_user_ns = {
.nr_extents = 1,
},
},
- .ns.ns_type = ns_common_type(&init_user_ns),
- .ns.__ns_ref = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
- .ns.inum = ns_init_inum(&init_user_ns),
-#ifdef CONFIG_USER_NS
- .ns.ops = &userns_operations,
-#endif
.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_KEYS
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 7e45559521af..52f89f1137da 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue,
offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
get_page(page);
len = n->info & WATCH_INFO_LENGTH;
- p = kmap_atomic(page);
+ p = kmap_local_page(page);
memcpy(p + offset, n, len);
- kunmap_atomic(p);
+ kunmap_local(p);
buf = pipe_buf(pipe, head);
buf->page = page;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 3034e294d50d..713cc94caa02 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -445,8 +445,7 @@ config FRAME_WARN
default 2048 if GCC_PLUGIN_LATENT_ENTROPY
default 2048 if PARISC
default 1536 if (!64BIT && XTENSA)
- default 1280 if KASAN && !64BIT
- default 1024 if !64BIT
+ default 1280 if !64BIT
default 2048 if 64BIT
help
Tell the compiler to warn at build time for stack frames larger than this.
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
index dcedfca06df6..5dccdee79693 100644
--- a/lib/crypto/tests/sha256_kunit.c
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -68,6 +68,7 @@ static void test_sha256_finup_2x(struct kunit *test)
rand_bytes(data1_buf, max_data_len);
rand_bytes(data2_buf, max_data_len);
rand_bytes(salt, sizeof(salt));
+ memset(ctx, 0, sizeof(*ctx));
for (size_t i = 0; i < 500; i++) {
size_t salt_len = rand_length(sizeof(salt));
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 39bb779cb311..5aa4c9500018 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -64,6 +64,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>
+#define TP_FCT tracepoint_string(__func__)
+
/*
* Kernel pointer hashing renders much of the maple tree dump useless as tagged
* pointers get hashed to arbitrary values.
@@ -2756,7 +2758,7 @@ static inline void mas_rebalance(struct ma_state *mas,
MA_STATE(l_mas, mas->tree, mas->index, mas->last);
MA_STATE(r_mas, mas->tree, mas->index, mas->last);
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
/*
* Rebalancing occurs if a node is insufficient. Data is rebalanced
@@ -2997,7 +2999,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node)
MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
mast.l = &l_mas;
mast.r = &r_mas;
@@ -3172,7 +3174,7 @@ static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
return false;
}
- trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
+ trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry);
return true;
}
@@ -3416,7 +3418,7 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
* of data may happen.
*/
mas = wr_mas->mas;
- trace_ma_op(__func__, mas);
+ trace_ma_op(TP_FCT, mas);
if (unlikely(!mas->index && mas->last == ULONG_MAX))
return mas_new_root(mas, wr_mas->entry);
@@ -3552,7 +3554,7 @@ done:
} else {
memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
}
- trace_ma_write(__func__, mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
mas_update_gap(mas);
mas->end = new_end;
return;
@@ -3596,7 +3598,7 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
mas->offset++; /* Keep mas accurate. */
}
- trace_ma_write(__func__, mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
/*
* Only update gap when the new entry is empty or there is an empty
* entry in the original two ranges.
@@ -3717,7 +3719,7 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas,
mas_update_gap(mas);
mas->end = new_end;
- trace_ma_write(__func__, mas, new_end, wr_mas->entry);
+ trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry);
return;
}
@@ -3731,7 +3733,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
struct maple_big_node b_node;
- trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
+ trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry);
memset(&b_node, 0, sizeof(struct maple_big_node));
mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
mas_commit_b_node(wr_mas, &b_node);
@@ -5062,7 +5064,7 @@ void *mas_store(struct ma_state *mas, void *entry)
{
MA_WR_STATE(wr_mas, mas, entry);
- trace_ma_write(__func__, mas, 0, entry);
+ trace_ma_write(TP_FCT, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
if (MAS_WARN_ON(mas, mas->index > mas->last))
pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
@@ -5163,7 +5165,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry)
}
store:
- trace_ma_write(__func__, mas, 0, entry);
+ trace_ma_write(TP_FCT, mas, 0, entry);
mas_wr_store_entry(&wr_mas);
MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
mas_destroy(mas);
@@ -5882,7 +5884,7 @@ void *mtree_load(struct maple_tree *mt, unsigned long index)
MA_STATE(mas, mt, index, index);
void *entry;
- trace_ma_read(__func__, &mas);
+ trace_ma_read(TP_FCT, &mas);
rcu_read_lock();
retry:
entry = mas_start(&mas);
@@ -5925,7 +5927,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index,
MA_STATE(mas, mt, index, last);
int ret = 0;
- trace_ma_write(__func__, &mas, 0, entry);
+ trace_ma_write(TP_FCT, &mas, 0, entry);
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return -EINVAL;
@@ -6148,7 +6150,7 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
void *entry = NULL;
MA_STATE(mas, mt, index, index);
- trace_ma_op(__func__, &mas);
+ trace_ma_op(TP_FCT, &mas);
mtree_lock(mt);
entry = mas_erase(&mas);
@@ -6485,7 +6487,7 @@ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
unsigned long copy = *index;
#endif
- trace_ma_read(__func__, &mas);
+ trace_ma_read(TP_FCT, &mas);
if ((*index) > max)
return NULL;
diff --git a/lib/test_kho.c b/lib/test_kho.c
index 60cd899ea745..fff018e5548d 100644
--- a/lib/test_kho.c
+++ b/lib/test_kho.c
@@ -301,6 +301,9 @@ static int __init kho_test_init(void)
phys_addr_t fdt_phys;
int err;
+ if (!kho_is_enabled())
+ return 0;
+
err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys);
if (!err)
return kho_test_restore(fdt_phys);
diff --git a/mm/Kconfig b/mm/Kconfig
index 0e26f4fc8717..ca3f146bc705 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -908,6 +908,13 @@ config PAGE_MAPCOUNT
config PGTABLE_HAS_HUGE_LEAVES
def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+#
+# We can end up creating gigantic folio.
+#
+config HAVE_GIGANTIC_FOLIOS
+ def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
+ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+
# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
def_bool n
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41b6c9386b69..c5740c6d37a2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -72,7 +72,7 @@ static void collect_wb_stats(struct wb_stats *stats,
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
stats->nr_more_io++;
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
- if (inode->i_state & I_DIRTY_TIME)
+ if (inode_state_read_once(inode) & I_DIRTY_TIME)
stats->nr_dirty_time++;
spin_unlock(&wb->list_lock);
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index d8010968bbed..bf8626859902 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us,
static struct damon_ctx *damon_stat_context;
+static unsigned long damon_stat_last_refresh_jiffies;
+
static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
{
struct damon_target *t;
@@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
static int damon_stat_damon_call_fn(void *data)
{
struct damon_ctx *c = data;
- static unsigned long last_refresh_jiffies;
/* avoid unnecessarily frequent stat update */
- if (time_before_eq(jiffies, last_refresh_jiffies +
+ if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies +
msecs_to_jiffies(5 * MSEC_PER_SEC)))
return 0;
- last_refresh_jiffies = jiffies;
+ damon_stat_last_refresh_jiffies = jiffies;
aggr_interval_us = c->attrs.aggr_interval;
damon_stat_set_estimated_memory_bandwidth(c);
@@ -210,6 +211,8 @@ static int damon_stat_start(void)
err = damon_start(&damon_stat_context, 1, true);
if (err)
return err;
+
+ damon_stat_last_refresh_jiffies = jiffies;
call_control.data = damon_stat_context;
return damon_call(damon_stat_context, &call_control);
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index cd6815ecc04e..3c0d727788c8 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1552,16 +1552,17 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ctx;
}
+static unsigned long damon_sysfs_next_update_jiffies;
+
static int damon_sysfs_repeat_call_fn(void *data)
{
struct damon_sysfs_kdamond *sysfs_kdamond = data;
- static unsigned long next_update_jiffies;
if (!sysfs_kdamond->refresh_ms)
return 0;
- if (time_before(jiffies, next_update_jiffies))
+ if (time_before(jiffies, damon_sysfs_next_update_jiffies))
return 0;
- next_update_jiffies = jiffies +
+ damon_sysfs_next_update_jiffies = jiffies +
msecs_to_jiffies(sysfs_kdamond->refresh_ms);
if (!mutex_trylock(&damon_sysfs_lock))
@@ -1607,6 +1608,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
}
kdamond->damon_ctx = ctx;
+ damon_sysfs_next_update_jiffies =
+ jiffies + msecs_to_jiffies(kdamond->refresh_ms);
+
repeat_call_control->fn = damon_sysfs_repeat_call_fn;
repeat_call_control->data = kdamond;
repeat_call_control->repeat = true;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 588fe76c5a14..67028e30aa91 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ filemap_flush_range(mapping, offset, endbyte);
/*
* First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..dfc8a31f1222 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -256,7 +256,7 @@ void filemap_remove_folio(struct folio *folio)
__filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
filemap_free_folio(mapping, folio);
@@ -335,7 +335,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
page_cache_delete_batch(mapping, fbatch);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
for (i = 0; i < folio_batch_count(fbatch); i++)
@@ -366,83 +366,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
return 0;
}
-/**
- * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
- * @mapping: address space structure to write
- * @wbc: the writeback_control controlling the writeout
- *
- * Call writepages on the mapping using the provided wbc to control the
- * writeout.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int filemap_fdatawrite_wbc(struct address_space *mapping,
- struct writeback_control *wbc)
+static int filemap_writeback(struct address_space *mapping, loff_t start,
+ loff_t end, enum writeback_sync_modes sync_mode,
+ long *nr_to_write)
{
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+ .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
int ret;
if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
- wbc_attach_fdatawrite_inode(wbc, mapping->host);
- ret = do_writepages(mapping, wbc);
- wbc_detach_inode(wbc);
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+ ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
+
+ if (!ret && nr_to_write)
+ *nr_to_write = wbc.nr_to_write;
return ret;
}
-EXPORT_SYMBOL(filemap_fdatawrite_wbc);
/**
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
- * @sync_mode: enable synchronous operation
*
* Start writeback against all of a mapping's dirty pages that lie
* within the byte offsets <start, end> inclusive.
*
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
+ * This is a data integrity operation that waits upon dirty or in writeback
+ * pages.
*
* Return: %0 on success, negative error code otherwise.
*/
-int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode)
-{
- struct writeback_control wbc = {
- .sync_mode = sync_mode,
- .nr_to_write = LONG_MAX,
- .range_start = start,
- .range_end = end,
- };
-
- return filemap_fdatawrite_wbc(mapping, &wbc);
-}
-
-static inline int __filemap_fdatawrite(struct address_space *mapping,
- int sync_mode)
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
- return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+ return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
int filemap_fdatawrite(struct address_space *mapping)
{
- return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+ return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_fdatawrite);
-int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end)
-{
- return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
-}
-EXPORT_SYMBOL(filemap_fdatawrite_range);
-
/**
- * filemap_fdatawrite_range_kick - start writeback on a range
+ * filemap_flush_range - start writeback on a range
* @mapping: target address_space
* @start: index to start writeback on
* @end: last (inclusive) index for writeback
@@ -452,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
*
* Return: %0 on success, negative error code otherwise.
*/
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end)
{
- return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+ return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
}
-EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+EXPORT_SYMBOL_GPL(filemap_flush_range);
/**
* filemap_flush - mostly a non-blocking flush
@@ -470,10 +447,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
*/
int filemap_flush(struct address_space *mapping)
{
- return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+ return filemap_flush_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_flush);
+/*
+ * Start writeback on @nr_to_write pages from @mapping. No one but the existing
+ * btrfs caller should be using this. Talk to linux-mm if you think adding a
+ * new caller is a good idea.
+ */
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
+{
+ return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
+ nr_to_write);
+}
+EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
+
/**
* filemap_range_has_page - check if a page exists in range.
* @mapping: address space within which to check
@@ -691,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
return 0;
if (mapping_needs_writeback(mapping)) {
- err = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
+ err = filemap_fdatawrite_range(mapping, lstart, lend);
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
@@ -794,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
return 0;
if (mapping_needs_writeback(mapping)) {
- err = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
+ err = filemap_fdatawrite_range(mapping, lstart, lend);
/* See comment of filemap_write_and_wait() */
if (err != -EIO)
__filemap_fdatawait_range(mapping, lstart, lend);
@@ -2366,6 +2353,64 @@ out:
}
EXPORT_SYMBOL(filemap_get_folios_tag);
+/**
+ * filemap_get_folios_dirty - Get a batch of dirty folios
+ * @mapping: The address_space to search
+ * @start: The starting folio index
+ * @end: The final folio index (inclusive)
+ * @fbatch: The batch to fill
+ *
+ * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
+ * the returned folios are presumed to be dirty or undergoing writeback. Dirty
+ * state is presumed because we don't block on folio lock nor want to miss
+ * folios. Callers that need to can recheck state upon locking the folio.
+ *
+ * This may not return all dirty folios if the batch gets filled up.
+ *
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
+ */
+unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ if (xa_is_value(folio))
+ continue;
+ if (folio_trylock(folio)) {
+ bool clean = !folio_test_dirty(folio) &&
+ !folio_test_writeback(folio);
+ folio_unlock(folio);
+ if (clean) {
+ folio_put(folio);
+ continue;
+ }
+ }
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ *start = folio->index + nr;
+ goto out;
+ }
+ }
+ /*
+ * We come here when there is no folio beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a folio at index -1 but that is
+ * already broke anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return folio_batch_count(fbatch);
+}
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -3681,8 +3726,10 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned short *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss,
+ pgoff_t file_end)
{
+ struct address_space *mapping = folio->mapping;
unsigned int ref_from_caller = 1;
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3691,12 +3738,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
unsigned long addr0;
/*
- * Map the large folio fully where possible.
+ * Map the large folio fully where possible:
*
- * The folio must not cross VMA or page table boundary.
+ * - The folio is fully within the size of the file or belongs
+ * to shmem/tmpfs;
+ * - The folio doesn't cross VMA boundary;
+ * - The folio doesn't cross page table boundary;
*/
addr0 = addr - start * PAGE_SIZE;
- if (folio_within_vma(folio, vmf->vma) &&
+ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+ folio_within_vma(folio, vmf->vma) &&
(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
vmf->pte -= start;
page -= start;
@@ -3817,7 +3868,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
if (!folio)
goto out;
- if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ end_pgoff = min(end_pgoff, file_end);
+
+ /*
+ * Do not allow to map with PMD across i_size to preserve
+ * SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs that for long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+ filemap_map_pmd(vmf, folio, start_pgoff)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@@ -3830,10 +3892,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
- file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
- if (end_pgoff > file_end)
- end_pgoff = file_end;
-
folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3850,7 +3908,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
else
ret |= filemap_map_folio_range(vmf, folio,
xas.xa_index - folio->index, addr,
- nr_pages, &rss, &mmap_miss);
+ nr_pages, &rss, &mmap_miss, file_end);
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -4457,16 +4515,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush,
unmap_mapping_pages(mapping, first, nr, false);
/* Write back the data if we're asked to. */
- if (flush) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = start,
- .range_end = end,
- };
-
- filemap_fdatawrite_wbc(mapping, &wbc);
- }
+ if (flush)
+ filemap_fdatawrite_range(mapping, start, end);
/* Wait for writeback to complete on all folios and discard. */
invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1d1b74950332..6cba1cb14b23 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -214,7 +214,8 @@ retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
return true;
- zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) &
+ ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_folio) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
@@ -3263,6 +3264,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
caller_pins;
}
+static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
+{
+ for (; nr_pages; page++, nr_pages--) /* walk nr_pages consecutive pages from @page */
+ if (PageHWPoison(page))
+ return true; /* one poisoned page is enough */
+ return false;
+}
+
/*
* It splits @folio into @new_order folios and copies the @folio metadata to
* all the resulting folios.
@@ -3270,17 +3279,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
static void __split_folio_to_order(struct folio *folio, int old_order,
int new_order)
{
+ /* Scan for poisoned pages when splitting a poisoned folio into large folios */
+ const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
long new_nr_pages = 1 << new_order;
long nr_pages = 1 << old_order;
long i;
+ folio_clear_has_hwpoisoned(folio);
+
+ /* Check first new_nr_pages since the loop below skips them */
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
+ folio_set_has_hwpoisoned(folio);
/*
* Skip the first new_nr_pages, since the new folio from them have all
* the flags from the original folio.
*/
for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
struct page *new_head = &folio->page + i;
-
/*
* Careful: new_folio is not a "real" folio before we cleared PageTail.
* Don't pass it around before clear_compound_head().
@@ -3322,6 +3338,10 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(new_head, new_nr_pages))
+ folio_set_has_hwpoisoned(new_folio);
+
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
@@ -3422,8 +3442,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
if (folio_test_anon(folio))
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- folio_clear_has_hwpoisoned(folio);
-
/*
* split to new_order one order at a time. For uniform split,
* folio is split to new_order directly.
@@ -3504,7 +3522,8 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
/* order-1 is not supported for anonymous THP. */
VM_WARN_ONCE(warns && new_order == 1,
"Cannot split to order-1 folio");
- return new_order != 1;
+ if (new_order == 1)
+ return false;
} else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) {
/*
@@ -3535,7 +3554,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
if (folio_test_anon(folio)) {
VM_WARN_ONCE(warns && new_order == 1,
"Cannot split to order-1 folio");
- return new_order != 1;
+ if (new_order == 1)
+ return false;
} else if (new_order) {
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) {
@@ -3599,6 +3619,16 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (folio != page_folio(split_at) || folio != page_folio(lock_at))
return -EINVAL;
+ /*
+ * Folios that just got truncated cannot get split. Signal to the
+ * caller that there was a race.
+ *
+ * TODO: this will also currently refuse shmem folios that are in the
+ * swapcache.
+ */
+ if (!is_anon && !folio->mapping)
+ return -EBUSY;
+
if (new_order >= folio_order(folio))
return -EINVAL;
@@ -3639,22 +3669,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
gfp_t gfp;
mapping = folio->mapping;
-
- /* Truncated ? */
- /*
- * TODO: add support for large shmem folio in swap cache.
- * When shmem is in swap cache, mapping is NULL and
- * folio_test_swapcache() is true.
- */
- if (!mapping) {
- ret = -EBUSY;
- goto out;
- }
-
min_order = mapping_min_folio_order(folio->mapping);
if (new_order < min_order) {
- VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
- min_order);
ret = -EINVAL;
goto out;
}
@@ -3986,12 +4002,7 @@ int min_order_for_split(struct folio *folio)
int split_folio_to_list(struct folio *folio, struct list_head *list)
{
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- return split_huge_page_to_list_to_order(&folio->page, list, ret);
+ return split_huge_page_to_list_to_order(&folio->page, list, 0);
}
/*
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 8bca7fece47f..35ceaa8adb41 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
- /* Don't sleep. */
- flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
-
handle = stack_depot_save(entries, nr_entries, flags);
return stack_depot_set_extra_bits(handle, extra);
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 2cee59d89c80..8f22d1f22981 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object)
if (s->ctor)
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ kmsan_internal_poison_memory(object, s->object_size,
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page),
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 54f3c3c962f0..55fdea199aaf 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -208,7 +208,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(page_address(page), page_size(page),
- GFP_KERNEL,
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 7bc726b50b2f..c4e730409949 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_next_page_arg { /* output of ksm_next_page_pmd_entry(), valid only when it returns 1 */
+ struct folio *folio; /* folio containing the found page; a reference is held */
+ struct page *page; /* the page that was found */
+ unsigned long addr; /* address at which the page was found */
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct ksm_next_page_arg *private = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *start_ptep = NULL, *ptep, pte; /* start_ptep stays NULL on the PMD-leaf path */
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
+ struct page *page;
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ if (ksm_test_exit(mm))
+ return 0;
+
+ cond_resched();
+
+ pmd = pmdp_get_lockless(pmdp); /* unlocked peek; re-read under the lock below if it is a leaf */
+ if (!pmd_present(pmd))
+ return 0; /* nothing found in this PMD range */
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(mm, pmdp);
+ pmd = pmdp_get(pmdp); /* re-read now that the PMD lock is held */
+
+ if (!pmd_present(pmd)) {
+ goto not_found_unlock;
+ } else if (pmd_leaf(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ goto not_found_unlock;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio)) /* only anon, non-zone-device folios qualify */
+ goto not_found_unlock;
+
+ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); /* advance by @addr's page offset within the PMD */
+ goto found_unlock;
+ }
+ spin_unlock(ptl); /* no longer a leaf: fall through to the PTE scan */
+ }
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { /* @addr ends up at the match */
+ pte = ptep_get(ptep);
+
+ if (!pte_present(pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ continue;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio)) /* same filter as the PMD path */
+ continue;
+ goto found_unlock;
+ }
+
+not_found_unlock: /* reached with ptl held, from either path */
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ return 0;
+found_unlock: /* reached with ptl held and page/folio/addr describing the match */
+ folio_get(folio); /* take the reference before dropping the lock */
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ private->page = page;
+ private->folio = folio;
+ private->addr = addr;
+ return 1; /* the caller checks for a positive return to detect a match */
+}
+
+static const struct mm_walk_ops ksm_next_page_ops = { /* read-only: only its address is passed to the walker */
+ .pmd_entry = ksm_next_page_pmd_entry, /* per-PMD callback that looks for the next candidate page */
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ next_mm:
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct ksm_next_page_arg ksm_next_page_arg;
struct page *tmp_page = NULL;
- struct folio_walk fw;
struct folio *folio;
if (ksm_test_exit(mm))
break;
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
+ int found;
+
+ found = walk_page_range_vma(vma, ksm_scan.address,
+ vma->vm_end,
+ &ksm_next_page_ops,
+ &ksm_next_page_arg);
+
+ if (found > 0) {
+ folio = ksm_next_page_arg.folio;
+ tmp_page = ksm_next_page_arg.page;
+ ksm_scan.address = ksm_next_page_arg.addr;
+ } else {
+ VM_WARN_ON_ONCE(found < 0);
+ ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
if (tmp_page) {
diff --git a/mm/memblock.c b/mm/memblock.c
index e23e16618e9b..f0f2dc66e9a2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1826,7 +1826,8 @@ phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int n
*/
unsigned long __init memblock_estimated_nr_free_pages(void)
{
- return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size());
+ return PHYS_PFN(memblock_phys_mem_size() -
+ memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE));
}
/* lowest address */
diff --git a/mm/memfd.c b/mm/memfd.c
index 1d109c1acf21..805e297916e5 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -96,9 +96,36 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
NULL,
gfp_mask);
if (folio) {
+ u32 hash;
+
+ /*
+ * Zero the folio to prevent information leaks to userspace.
+ * Use folio_zero_user() which is optimized for huge/gigantic
+ * pages. Pass 0 as addr_hint since this is not a faulting path
+ * and we don't have a user virtual address yet.
+ */
+ folio_zero_user(folio, 0);
+
+ /*
+ * Mark the folio uptodate before adding to page cache,
+ * as required by filemap.c and other hugetlb paths.
+ */
+ __folio_mark_uptodate(folio);
+
+ /*
+ * Serialize hugepage allocation and instantiation to prevent
+ * races with concurrent allocations, as required by all other
+ * callers of hugetlb_add_to_page_cache().
+ */
+ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
err = hugetlb_add_to_page_cache(folio,
memfd->f_mapping,
idx);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
if (err) {
folio_put(folio);
goto err_unresv;
@@ -470,9 +497,9 @@ SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- struct file *file;
- int fd, error;
- char *name;
+ char *name __free(kfree) = NULL;
+ unsigned int fd_flags;
+ int error;
error = sanitize_flags(&flags);
if (error < 0)
@@ -482,25 +509,6 @@ SYSCALL_DEFINE2(memfd_create,
if (IS_ERR(name))
return PTR_ERR(name);
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_free_name;
- }
-
- file = alloc_file(name, flags);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_free_fd;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_free_fd:
- put_unused_fd(fd);
-err_free_name:
- kfree(name);
- return error;
+ fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
+ return FD_ADD(fd_flags, alloc_file(name, flags));
}
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..b59ae7ce42eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
@@ -5501,8 +5502,25 @@ fallback:
return ret;
}
+ if (!needs_fallback && vma->vm_file) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t file_end;
+
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+
+ /*
+ * Do not allow to map with PTEs beyond i_size and with PMD
+ * across i_size to preserve SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs that for long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ needs_fallback = !shmem_mapping(mapping) &&
+ file_end < folio_next_index(folio);
+ }
+
if (pmd_none(*vmf->pmd)) {
- if (folio_test_pmd_mappable(folio)) {
+ if (!needs_fallback && folio_test_pmd_mappable(folio)) {
ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
diff --git a/mm/mempool.c b/mm/mempool.c
index 1c38e873e546..d7bbf1189db9 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -68,10 +68,20 @@ static void check_element(mempool_t *pool, void *element)
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __check_element(pool, addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __check_element(pool, addr, PAGE_SIZE << order);
+#endif
}
}
@@ -97,10 +107,20 @@ static void poison_element(mempool_t *pool, void *element)
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __poison_element(addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __poison_element(addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __poison_element(addr, PAGE_SIZE << order);
+#endif
}
}
#else /* CONFIG_SLUB_DEBUG_ON */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3db2dea7db4c..7712d887b696 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename,
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ tablename, 1UL << log2qty, get_order(size), size,
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 0a0db5849b8e..42e3dde73e74 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -241,6 +241,7 @@ retry:
if (PTR_ERR(vma) == -EAGAIN) {
count_vm_vma_lock_event(VMA_LOCK_MISS);
/* The area was replaced with another one */
+ mas_set(&mas, address);
goto retry;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index bd7314898ec5..419a0ea0a870 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -187,7 +187,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
if (!folio || !folio_test_large(folio))
return 1;
- return folio_pte_batch(folio, ptep, pte, max_nr);
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
}
static int move_ptes(struct pagetable_move_control *pmc,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 757bc4d3b5b5..a124ab6a205d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping,
return true;
}
-static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
-{
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- return PAGECACHE_TAG_TOWRITE;
- return PAGECACHE_TAG_DIRTY;
-}
static pgoff_t wbc_end(struct writeback_control *wbc)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..ed82ee55e66a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* If memory tags should be zeroed
* (which happens only when memory should be initialized as well).
*/
- if (zero_tags) {
- /* Initialize both memory and memory tags. */
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
+ if (zero_tags)
+ init = !tag_clear_highpages(page, 1 << order);
- /* Take note that memory was initialized by the loop above. */
- init = false;
- }
if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) {
/* Take note that memory was initialized by KASAN. */
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 60137305bc20..f0ef4e198884 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -82,13 +82,13 @@ retry:
__folio_mark_uptodate(folio);
err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
set_direct_map_default_noflush(folio_page(folio, 0));
+ folio_put(folio);
if (err == -EEXIST)
goto retry;
@@ -224,9 +224,6 @@ err_free_inode:
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
- struct file *file;
- int fd, err;
-
/* make sure local flags do not confict with global fcntl.h */
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
@@ -238,22 +235,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
if (atomic_read(&secretmem_users) < 0)
return -ENFILE;
- fd = get_unused_fd_flags(flags & O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- file = secretmem_file_create(flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto err_put_fd;
- }
-
- fd_install(fd, file);
- return fd;
-
-err_put_fd:
- put_unused_fd(fd);
- return err;
+ return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags));
}
static int secretmem_init_fs_context(struct fs_context *fc)
diff --git a/mm/shmem.c b/mm/shmem.c
index b9081b817d28..899303d8c9aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -131,8 +131,7 @@ struct shmem_options {
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
-#define SHMEM_SEEN_NOSWAP 16
-#define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_QUOTA 16
};
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1076,7 +1075,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
-static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
bool unfalloc)
{
struct address_space *mapping = inode->i_mapping;
@@ -1133,7 +1132,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if (folio) {
- same_folio = lend < folio_pos(folio) + folio_size(folio);
+ same_folio = lend < folio_next_pos(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio_next_index(folio);
@@ -1227,7 +1226,7 @@ whole_folios:
shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -1882,6 +1881,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
+ pgoff_t aligned_index;
long pages;
int error, order;
@@ -1895,10 +1895,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
order = highest_order(suitable_orders);
while (suitable_orders) {
pages = 1UL << order;
- index = round_down(index, pages);
- folio = shmem_alloc_folio(gfp, order, info, index);
- if (folio)
+ aligned_index = round_down(index, pages);
+ folio = shmem_alloc_folio(gfp, order, info, aligned_index);
+ if (folio) {
+ index = aligned_index;
goto allocated;
+ }
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
@@ -4677,7 +4679,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Turning off swap in unprivileged tmpfs mounts unsupported");
}
ctx->noswap = true;
- ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
case Opt_quota:
if (fc->user_ns != &init_user_ns)
@@ -4827,14 +4828,15 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
- if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+
+ /*
+ * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
+ * counterpart for (re-)enabling swap.
+ */
+ if (ctx->noswap && !sbinfo->noswap) {
err = "Cannot disable swap on remount";
goto out;
}
- if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
- err = "Cannot enable swap on remount if it was disabled on first mount";
- goto out;
- }
if (ctx->seen & SHMEM_SEEN_QUOTA &&
!sb_any_quota_loaded(fc->root->d_sb)) {
@@ -5776,7 +5778,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
}
#endif
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
diff --git a/mm/slub.c b/mm/slub.c
index f1a5373eee7b..a0b905c2a557 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2046,7 +2046,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
if (slab_exts) {
unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
obj_exts_slab, obj_exts);
- /* codetag should be NULL */
+
+ if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+ return;
+
+ /* codetag should be NULL here */
WARN_ON(slab_exts[offs].ref.ct);
set_codetag_empty(&slab_exts[offs].ref);
}
@@ -6332,8 +6336,6 @@ next_remote_batch:
if (unlikely(!slab_free_hook(s, p[i], init, false))) {
p[i] = p[--size];
- if (!size)
- goto flush_remote;
continue;
}
@@ -6348,6 +6350,9 @@ next_remote_batch:
i++;
}
+ if (!size)
+ goto flush_remote;
+
next_batch:
if (!local_trylock(&s->cpu_sheaves->lock))
goto fallback;
@@ -6402,6 +6407,9 @@ do_free:
goto next_batch;
}
+ if (remote_nr)
+ goto flush_remote;
+
return;
no_empty:
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b13e9c4baa90..f4980dde5394 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -748,6 +748,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
+ struct swap_info_struct *si = NULL;
+
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -761,8 +763,19 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
continue;
pte_unmap(pte);
pte = NULL;
+ /*
+ * Readahead entry may come from a device that we are not
+ * holding a reference to, try to grab a reference, or skip.
+ */
+ if (swp_type(entry) != swp_type(targ_entry)) {
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ }
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
+ if (si)
+ put_swap_device(si);
if (!folio)
continue;
if (page_allocated) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 10760240a3a2..a1b4b9d80e3b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2005,10 +2005,8 @@ swp_entry_t get_swap_page_of_type(int type)
local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
local_unlock(&percpu_swap_cluster.lock);
- if (offset) {
+ if (offset)
entry = swp_entry(si->type, offset);
- atomic_long_dec(&nr_swap_pages);
- }
}
put_swap_device(si);
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 91eb92a5ce4f..12467c1bd711 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping,
xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
@@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
out:
folio_batch_remove_exceptionals(fbatch);
@@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
return 0;
}
+static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at,
+ unsigned long min_order)
+{
+ enum ttu_flags ttu_flags =
+ TTU_SYNC |
+ TTU_SPLIT_HUGE_PMD |
+ TTU_IGNORE_MLOCK;
+ int ret;
+
+ ret = try_folio_split_to_order(folio, split_at, min_order);
+
+ /*
+ * If the split fails, unmap the folio, so it will be refaulted
+ * with PTEs to respect SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs, which has for a long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ if (ret && !shmem_mapping(folio->mapping)) {
+ try_to_unmap(folio, ttu_flags);
+ WARN_ON(folio_mapped(folio)); /* warn if the folio is still mapped after the unmap attempt */
+ }
+
+ return ret; /* the split result is returned even when the folio was unmapped */
+}
+
/*
* Handle partial folios. The folio may be entirely within the
* range if a split has raced with us. If not, we zero the part of the
@@ -194,6 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
+ unsigned int min_order;
if (pos < start)
offset = start - pos;
@@ -223,8 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_test_large(folio))
return true;
+ min_order = mapping_min_folio_order(folio->mapping);
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- if (!try_folio_split(folio, split_at, NULL)) {
+ if (!try_folio_split_or_unmap(folio, split_at, min_order)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
@@ -248,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_trylock(folio2))
goto out;
- /*
- * make sure folio2 is large and does not change its mapping.
- * Its split result does not matter here.
- */
+ /* make sure folio2 is large and does not change its mapping */
if (folio_test_large(folio2) &&
folio2->mapping == folio->mapping)
- try_folio_split(folio2, split_at2, NULL);
+ try_folio_split_or_unmap(folio2, split_at2, min_order);
folio_unlock(folio2);
out:
@@ -339,7 +364,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
* page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
- loff_t lstart, loff_t lend)
+ loff_t lstart, uoff_t lend)
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
@@ -387,7 +412,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
if (!IS_ERR(folio)) {
- same_folio = lend < folio_pos(folio) + folio_size(folio);
+ same_folio = lend < folio_next_pos(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio_next_index(folio);
if (same_folio)
@@ -622,7 +647,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
__filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
filemap_free_folio(mapping, folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2fc8b626d3d..bb4a96c7b682 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -811,7 +811,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
__filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (free_folio)
diff --git a/mm/workingset.c b/mm/workingset.c
index 68a76a91111f..d32dc2e02a61 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -755,7 +755,7 @@ out_invalid:
xa_unlock_irq(&mapping->i_pages);
if (mapping->host != NULL) {
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
ret = LRU_REMOVED_RETRY;
diff --git a/net/atm/common.c b/net/atm/common.c
index 881c7f259dbd..c4edc1111bf0 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -881,7 +881,7 @@ out_atmproc_exit:
out_atmsvc_exit:
atmsvc_exit();
out_atmpvc_exit:
- atmsvc_exit();
+ atmpvc_exit();
out_unregister_vcc_proto:
proto_unregister(&vcc_proto);
goto out;
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index f0c862091bff..2c21ae8abadc 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -53,6 +53,11 @@ static bool enable_6lowpan;
static struct l2cap_chan *listen_chan;
static DEFINE_MUTEX(set_lock);
+enum {
+ LOWPAN_PEER_CLOSING,
+ LOWPAN_PEER_MAXBITS
+};
+
struct lowpan_peer {
struct list_head list;
struct rcu_head rcu;
@@ -61,6 +66,8 @@ struct lowpan_peer {
/* peer addresses in various formats */
unsigned char lladdr[ETH_ALEN];
struct in6_addr peer_addr;
+
+ DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS);
};
struct lowpan_btle_dev {
@@ -289,6 +296,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
local_skb->pkt_type = PACKET_HOST;
local_skb->dev = dev;
+ skb_reset_mac_header(local_skb);
skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
@@ -919,7 +927,9 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
BT_DBG("peer %p chan %p", peer, peer->chan);
+ l2cap_chan_lock(peer->chan);
l2cap_chan_close(peer->chan, ENOENT);
+ l2cap_chan_unlock(peer->chan);
return 0;
}
@@ -956,10 +966,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void)
}
static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
- struct l2cap_conn **conn)
+ struct l2cap_conn **conn, bool disconnect)
{
struct hci_conn *hcon;
struct hci_dev *hdev;
+ int le_addr_type;
int n;
n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
@@ -970,13 +981,32 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
if (n < 7)
return -EINVAL;
+ if (disconnect) {
+ /* The "disconnect" debugfs command has used different address
+ * type constants than "connect" since 2015. Let's retain that
+ * for now even though it's obviously buggy...
+ */
+ *addr_type += 1;
+ }
+
+ switch (*addr_type) {
+ case BDADDR_LE_PUBLIC:
+ le_addr_type = ADDR_LE_DEV_PUBLIC;
+ break;
+ case BDADDR_LE_RANDOM:
+ le_addr_type = ADDR_LE_DEV_RANDOM;
+ break;
+ default:
+ return -EINVAL;
+ }
+
/* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
if (!hdev)
return -ENOENT;
hci_dev_lock(hdev);
- hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type);
+ hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type);
hci_dev_unlock(hdev);
hci_dev_put(hdev);
@@ -993,41 +1023,52 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
static void disconnect_all_peers(void)
{
struct lowpan_btle_dev *entry;
- struct lowpan_peer *peer, *tmp_peer, *new_peer;
- struct list_head peers;
-
- INIT_LIST_HEAD(&peers);
+ struct lowpan_peer *peer;
+ int nchans;
- /* We make a separate list of peers as the close_cb() will
- * modify the device peers list so it is better not to mess
- * with the same list at the same time.
+ /* l2cap_chan_close() cannot be called from RCU, and lock ordering
+ * chan->lock > devices_lock prevents taking write side lock, so copy
+ * then close.
*/
rcu_read_lock();
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list)
+ list_for_each_entry_rcu(peer, &entry->peers, list)
+ clear_bit(LOWPAN_PEER_CLOSING, peer->flags);
+ rcu_read_unlock();
- list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
- list_for_each_entry_rcu(peer, &entry->peers, list) {
- new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC);
- if (!new_peer)
- break;
+ do {
+ struct l2cap_chan *chans[32];
+ int i;
- new_peer->chan = peer->chan;
- INIT_LIST_HEAD(&new_peer->list);
+ nchans = 0;
- list_add(&new_peer->list, &peers);
- }
- }
+ spin_lock(&devices_lock);
- rcu_read_unlock();
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry_rcu(peer, &entry->peers, list) {
+ if (test_and_set_bit(LOWPAN_PEER_CLOSING,
+ peer->flags))
+ continue;
- spin_lock(&devices_lock);
- list_for_each_entry_safe(peer, tmp_peer, &peers, list) {
- l2cap_chan_close(peer->chan, ENOENT);
+ l2cap_chan_hold(peer->chan);
+ chans[nchans++] = peer->chan;
- list_del_rcu(&peer->list);
- kfree_rcu(peer, rcu);
- }
- spin_unlock(&devices_lock);
+ if (nchans >= ARRAY_SIZE(chans))
+ goto done;
+ }
+ }
+
+done:
+ spin_unlock(&devices_lock);
+
+ for (i = 0; i < nchans; ++i) {
+ l2cap_chan_lock(chans[i]);
+ l2cap_chan_close(chans[i], ENOENT);
+ l2cap_chan_unlock(chans[i]);
+ l2cap_chan_put(chans[i]);
+ }
+ } while (nchans);
}
struct set_enable {
@@ -1050,7 +1091,9 @@ static void do_enable_set(struct work_struct *work)
mutex_lock(&set_lock);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
}
@@ -1103,13 +1146,15 @@ static ssize_t lowpan_control_write(struct file *fp,
buf[buf_size] = '\0';
if (memcmp(buf, "connect ", 8) == 0) {
- ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn);
+ ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false);
if (ret == -EINVAL)
return ret;
mutex_lock(&set_lock);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
listen_chan = NULL;
}
@@ -1140,7 +1185,7 @@ static ssize_t lowpan_control_write(struct file *fp,
}
if (memcmp(buf, "disconnect ", 11) == 0) {
- ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn);
+ ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true);
if (ret < 0)
return ret;
@@ -1271,7 +1316,9 @@ static void __exit bt_6lowpan_exit(void)
debugfs_remove(lowpan_control_debugfs);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
}
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c5dedf39a129..6fc0692abf05 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -769,21 +769,23 @@ static void find_bis(struct hci_conn *conn, void *data)
d->count++;
}
-static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn)
+static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn)
{
struct iso_list_data *d;
int ret;
- bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle);
+ bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn,
+ conn->iso_qos.bcast.big, conn->sync_handle);
d = kzalloc(sizeof(*d), GFP_KERNEL);
if (!d)
return -ENOMEM;
- d->big = big;
+ d->big = conn->iso_qos.bcast.big;
d->sync_handle = conn->sync_handle;
- if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
+ if (conn->type == PA_LINK &&
+ test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
hci_conn_hash_list_flag(hdev, find_bis, PA_LINK,
HCI_CONN_PA_SYNC, d);
@@ -801,6 +803,9 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c
d->big_sync_term = true;
}
+ if (!d->pa_sync_term && !d->big_sync_term)
+ return 0;
+
ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d,
terminate_big_destroy);
if (ret)
@@ -852,8 +857,7 @@ static void bis_cleanup(struct hci_conn *conn)
hci_le_terminate_big(hdev, conn);
} else {
- hci_le_big_terminate(hdev, conn->iso_qos.bcast.big,
- conn);
+ hci_le_big_terminate(hdev, conn);
}
}
@@ -994,19 +998,20 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
break;
case CIS_LINK:
- case BIS_LINK:
- case PA_LINK:
/* conn->src should reflect the local identity address */
hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
- /* set proper cleanup function */
- if (!bacmp(dst, BDADDR_ANY))
- conn->cleanup = bis_cleanup;
- else if (conn->role == HCI_ROLE_MASTER)
+ if (conn->role == HCI_ROLE_MASTER)
conn->cleanup = cis_cleanup;
- conn->mtu = hdev->iso_mtu ? hdev->iso_mtu :
- hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
+ conn->mtu = hdev->iso_mtu;
+ break;
+ case PA_LINK:
+ case BIS_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ conn->cleanup = bis_cleanup;
+ conn->mtu = hdev->iso_mtu;
break;
case SCO_LINK:
if (lmp_esco_capable(hdev))
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 3418d7b964a1..8ccec73dce45 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3832,13 +3832,14 @@ static void hci_tx_work(struct work_struct *work)
static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_acl_hdr *hdr;
- struct hci_conn *conn;
__u16 handle, flags;
+ int err;
hdr = skb_pull_data(skb, sizeof(*hdr));
if (!hdr) {
bt_dev_err(hdev, "ACL packet too small");
- goto drop;
+ kfree_skb(skb);
+ return;
}
handle = __le16_to_cpu(hdr->handle);
@@ -3850,36 +3851,27 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
hdev->stat.acl_rx++;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
-
- if (conn) {
- hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF);
-
- /* Send to upper protocol */
- l2cap_recv_acldata(conn, skb, flags);
- return;
- } else {
+ err = l2cap_recv_acldata(hdev, handle, skb, flags);
+ if (err == -ENOENT)
bt_dev_err(hdev, "ACL packet for unknown connection handle %d",
handle);
- }
-
-drop:
- kfree_skb(skb);
+ else if (err)
+ bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d",
+ handle, err);
}
/* SCO data packet */
static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_sco_hdr *hdr;
- struct hci_conn *conn;
__u16 handle, flags;
+ int err;
hdr = skb_pull_data(skb, sizeof(*hdr));
if (!hdr) {
bt_dev_err(hdev, "SCO packet too small");
- goto drop;
+ kfree_skb(skb);
+ return;
}
handle = __le16_to_cpu(hdr->handle);
@@ -3891,34 +3883,28 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
hdev->stat.sco_rx++;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
+ hci_skb_pkt_status(skb) = flags & 0x03;
- if (conn) {
- /* Send to upper protocol */
- hci_skb_pkt_status(skb) = flags & 0x03;
- sco_recv_scodata(conn, skb);
- return;
- } else {
+ err = sco_recv_scodata(hdev, handle, skb);
+ if (err == -ENOENT)
bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
handle);
- }
-
-drop:
- kfree_skb(skb);
+ else if (err)
+ bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d",
+ handle, err);
}
static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_iso_hdr *hdr;
- struct hci_conn *conn;
__u16 handle, flags;
+ int err;
hdr = skb_pull_data(skb, sizeof(*hdr));
if (!hdr) {
bt_dev_err(hdev, "ISO packet too small");
- goto drop;
+ kfree_skb(skb);
+ return;
}
handle = __le16_to_cpu(hdr->handle);
@@ -3928,22 +3914,13 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
handle, flags);
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
-
- if (!conn) {
+ err = iso_recv(hdev, handle, skb, flags);
+ if (err == -ENOENT)
bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
handle);
- goto drop;
- }
-
- /* Send to upper protocol */
- iso_recv(conn, skb, flags);
- return;
-
-drop:
- kfree_skb(skb);
+ else if (err)
+ bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d",
+ handle, err);
}
static bool hci_req_is_complete(struct hci_dev *hdev)
@@ -4121,7 +4098,7 @@ static void hci_rx_work(struct work_struct *work)
}
}
-static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
+static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
{
int err;
@@ -4133,16 +4110,19 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
if (!hdev->sent_cmd) {
skb_queue_head(&hdev->cmd_q, skb);
queue_work(hdev->workqueue, &hdev->cmd_work);
- return;
+ return -EINVAL;
}
if (hci_skb_opcode(skb) != HCI_OP_NOP) {
err = hci_send_frame(hdev, skb);
if (err < 0) {
hci_cmd_sync_cancel_sync(hdev, -err);
- return;
+ return err;
}
atomic_dec(&hdev->cmd_cnt);
+ } else {
+ err = -ENODATA;
+ kfree_skb(skb);
}
if (hdev->req_status == HCI_REQ_PEND &&
@@ -4150,12 +4130,15 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(hdev->req_skb);
hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
}
+
+ return err;
}
static void hci_cmd_work(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
struct sk_buff *skb;
+ int err;
BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name,
atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q));
@@ -4166,7 +4149,9 @@ static void hci_cmd_work(struct work_struct *work)
if (!skb)
return;
- hci_send_cmd_sync(hdev, skb);
+ err = hci_send_cmd_sync(hdev, skb);
+ if (err)
+ return;
rcu_read_lock();
if (test_bit(HCI_RESET, &hdev->flags) ||
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f20c826509b6..3838b90343d9 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5843,6 +5843,29 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data,
le16_to_cpu(ev->supervision_timeout));
}
+static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_pa_sync_lost *ev = data;
+ u16 handle = le16_to_cpu(ev->handle);
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle);
+
+ hci_dev_lock(hdev);
+
+ /* Delete the pa sync connection */
+ conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle);
+ if (conn) {
+ clear_bit(HCI_CONN_BIG_SYNC, &conn->flags);
+ clear_bit(HCI_CONN_PA_SYNC, &conn->flags);
+ hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM);
+ hci_conn_del(conn);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
@@ -7001,14 +7024,9 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
continue;
}
- if (ev->status != 0x42) {
+ if (ev->status != 0x42)
/* Mark PA sync as established */
set_bit(HCI_CONN_PA_SYNC, &bis->flags);
- /* Reset cleanup callback of PA Sync so it doesn't
- * terminate the sync when deleting the connection.
- */
- conn->cleanup = NULL;
- }
bis->sync_handle = conn->sync_handle;
bis->iso_qos.bcast.big = ev->handle;
@@ -7051,29 +7069,24 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
struct hci_evt_le_big_sync_lost *ev = data;
- struct hci_conn *bis, *conn;
- bool mgmt_conn;
+ struct hci_conn *bis;
+ bool mgmt_conn = false;
bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle);
hci_dev_lock(hdev);
- /* Delete the pa sync connection */
- bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle);
- if (bis) {
- conn = hci_conn_hash_lookup_pa_sync_handle(hdev,
- bis->sync_handle);
- if (conn)
- hci_conn_del(conn);
- }
-
/* Delete each bis connection */
while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle,
BT_CONNECTED,
HCI_ROLE_SLAVE))) {
- mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags);
- mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type,
- ev->reason, mgmt_conn);
+ if (!mgmt_conn) {
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED,
+ &bis->flags);
+ mgmt_device_disconnected(hdev, &bis->dst, bis->type,
+ bis->dst_type, ev->reason,
+ mgmt_conn);
+ }
clear_bit(HCI_CONN_BIG_SYNC, &bis->flags);
hci_disconn_cfm(bis, ev->reason);
@@ -7187,6 +7200,9 @@ static const struct hci_le_ev {
hci_le_per_adv_report_evt,
sizeof(struct hci_ev_le_per_adv_report),
HCI_MAX_EVENT_SIZE),
+ /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */
+ HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt,
+ sizeof(struct hci_ev_le_pa_sync_lost)),
/* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
sizeof(struct hci_evt_le_ext_adv_set_term)),
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index fc866759910d..ad19022ae127 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1311,7 +1311,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
goto done;
}
+ hci_dev_lock(hdev);
mgmt_index_removed(hdev);
+ hci_dev_unlock(hdev);
err = hci_dev_open(hdev->id);
if (err) {
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 73fc41b68b68..6e76798ec786 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6999,7 +6999,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
hci_dev_lock(hdev);
- if (!hci_conn_valid(hdev, conn))
+ if (hci_conn_valid(hdev, conn))
clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
if (!err)
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 3d98cb6291da..616c2fef91d2 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -2314,14 +2314,31 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
iso_conn_del(hcon, bt_to_errno(reason));
}
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags)
{
- struct iso_conn *conn = hcon->iso_data;
+ struct hci_conn *hcon;
+ struct iso_conn *conn;
struct skb_shared_hwtstamps *hwts;
__u16 pb, ts, len, sn;
- if (!conn)
- goto drop;
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ conn = iso_conn_hold_unless_zero(hcon->iso_data);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
pb = hci_iso_flags_pb(flags);
ts = hci_iso_flags_ts(flags);
@@ -2377,7 +2394,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
hci_skb_pkt_status(skb) = flags & 0x03;
hci_skb_pkt_seqnum(skb) = sn;
iso_recv_frame(conn, skb);
- return;
+ goto done;
}
if (pb == ISO_SINGLE) {
@@ -2455,6 +2472,9 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
drop:
kfree_skb(skb);
+done:
+ iso_conn_put(conn);
+ return 0;
}
static struct hci_cb iso_cb = {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d08320380ad6..07b493331fd7 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -497,6 +497,7 @@ void l2cap_chan_hold(struct l2cap_chan *c)
kref_get(&c->kref);
}
+EXPORT_SYMBOL_GPL(l2cap_chan_hold);
struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c)
{
@@ -7509,13 +7510,24 @@ struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c)
return c;
}
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
{
+ struct hci_conn *hcon;
struct l2cap_conn *conn;
int len;
- /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */
- hci_dev_lock(hcon->hdev);
+ /* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF);
conn = hcon->l2cap_data;
@@ -7523,12 +7535,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
conn = l2cap_conn_add(hcon);
conn = l2cap_conn_hold_unless_zero(conn);
+ hcon = NULL;
- hci_dev_unlock(hcon->hdev);
+ hci_dev_unlock(hdev);
if (!conn) {
kfree_skb(skb);
- return;
+ return -EINVAL;
}
BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags);
@@ -7642,6 +7655,7 @@ drop:
unlock:
mutex_unlock(&conn->lock);
l2cap_conn_put(conn);
+ return 0;
}
static struct hci_cb l2cap_cb = {
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 79762bfaea5f..262bf984d2aa 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -9497,6 +9497,7 @@ void mgmt_index_removed(struct hci_dev *hdev)
cancel_delayed_work_sync(&hdev->discov_off);
cancel_delayed_work_sync(&hdev->service_cache);
cancel_delayed_work_sync(&hdev->rpa_expired);
+ cancel_delayed_work_sync(&hdev->mesh_send_done);
}
void mgmt_power_on(struct hci_dev *hdev, int err)
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ab0cf442d57b..298c2a9ab4df 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -1458,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
sco_conn_del(hcon, bt_to_errno(reason));
}
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb)
{
- struct sco_conn *conn = hcon->sco_data;
+ struct hci_conn *hcon;
+ struct sco_conn *conn;
- if (!conn)
- goto drop;
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ conn = sco_conn_hold_unless_zero(hcon->sco_data);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
BT_DBG("conn %p len %u", conn, skb->len);
- if (skb->len) {
+ if (skb->len)
sco_recv_frame(conn, skb);
- return;
- }
+ else
+ kfree_skb(skb);
-drop:
- kfree_skb(skb);
+ sco_conn_put(conn);
+ return 0;
}
static struct hci_cb sco_cb = {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 45512b2ba951..3a1ce04a7a53 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -2136,7 +2136,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
struct smp_chan *smp = chan->data;
struct hci_conn *hcon = conn->hcon;
u8 *pkax, *pkbx, *na, *nb, confirm_hint;
- u32 passkey;
+ u32 passkey = 0;
int err;
bt_dev_dbg(hcon->hdev, "conn %p", conn);
@@ -2188,24 +2188,6 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
-
- /* Only Just-Works pairing requires extra checks */
- if (smp->method != JUST_WORKS)
- goto mackey_and_ltk;
-
- /* If there already exists long term key in local host, leave
- * the decision to user space since the remote device could
- * be legitimate or malicious.
- */
- if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
- hcon->role)) {
- /* Set passkey to 0. The value can be any number since
- * it'll be ignored anyway.
- */
- passkey = 0;
- confirm_hint = 1;
- goto confirm;
- }
}
mackey_and_ltk:
@@ -2226,11 +2208,12 @@ mackey_and_ltk:
if (err)
return SMP_UNSPECIFIED;
- confirm_hint = 0;
-
-confirm:
- if (smp->method == JUST_WORKS)
- confirm_hint = 1;
+ /* Always require user confirmation for Just-Works pairing to prevent
+ * impersonation attacks, or in case of a legitimate device that is
+ * repairing use the confirmation as acknowledgment to proceed with the
+ * creation of new keys.
+ */
+ confirm_hint = smp->method == JUST_WORKS ? 1 : 0;
err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type,
hcon->dst_type, passkey, confirm_hint);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index b71b1635916e..a21c157daf7d 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -631,6 +631,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id,
/* connection secret */
ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
dout("%s connection secret blob len %d\n", __func__, len);
if (len > 0) {
dp = *p + ceph_x_encrypt_offset();
@@ -648,6 +649,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id,
/* service tickets */
ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
dout("%s service tickets blob len %d\n", __func__, len);
if (len > 0) {
ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4c6441536d55..e734e57be083 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -786,41 +786,52 @@ void ceph_reset_client_addr(struct ceph_client *client)
EXPORT_SYMBOL(ceph_reset_client_addr);
/*
- * true if we have the mon map (and have thus joined the cluster)
- */
-static bool have_mon_and_osd_map(struct ceph_client *client)
-{
- return client->monc.monmap && client->monc.monmap->epoch &&
- client->osdc.osdmap && client->osdc.osdmap->epoch;
-}
-
-/*
* mount: join the ceph cluster, and open root directory.
*/
-int __ceph_open_session(struct ceph_client *client, unsigned long started)
+int __ceph_open_session(struct ceph_client *client)
{
- unsigned long timeout = client->options->mount_timeout;
- long err;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ long timeout = ceph_timeout_jiffies(client->options->mount_timeout);
+ bool have_monmap, have_osdmap;
+ int err;
/* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc);
if (err < 0)
return err;
- while (!have_mon_and_osd_map(client)) {
- if (timeout && time_after_eq(jiffies, started + timeout))
- return -ETIMEDOUT;
+ add_wait_queue(&client->auth_wq, &wait);
+ for (;;) {
+ mutex_lock(&client->monc.mutex);
+ err = client->auth_err;
+ have_monmap = client->monc.monmap && client->monc.monmap->epoch;
+ mutex_unlock(&client->monc.mutex);
+
+ down_read(&client->osdc.lock);
+ have_osdmap = client->osdc.osdmap && client->osdc.osdmap->epoch;
+ up_read(&client->osdc.lock);
+
+ if (err || (have_monmap && have_osdmap))
+ break;
+
+ if (signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ if (!timeout) {
+ err = -ETIMEDOUT;
+ break;
+ }
/* wait */
dout("mount waiting for mon_map\n");
- err = wait_event_interruptible_timeout(client->auth_wq,
- have_mon_and_osd_map(client) || (client->auth_err < 0),
- ceph_timeout_jiffies(timeout));
- if (err < 0)
- return err;
- if (client->auth_err < 0)
- return client->auth_err;
+ timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
}
+ remove_wait_queue(&client->auth_wq, &wait);
+
+ if (err)
+ return err;
pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
&client->fsid);
@@ -833,12 +844,11 @@ EXPORT_SYMBOL(__ceph_open_session);
int ceph_open_session(struct ceph_client *client)
{
int ret;
- unsigned long started = jiffies; /* note the start time */
dout("open_session start\n");
mutex_lock(&client->mount_mutex);
- ret = __ceph_open_session(client, started);
+ ret = __ceph_open_session(client);
mutex_unlock(&client->mount_mutex);
return ret;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 2110439f8a24..83c270bce63c 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -36,8 +36,9 @@ static int monmap_show(struct seq_file *s, void *p)
int i;
struct ceph_client *client = s->private;
+ mutex_lock(&client->monc.mutex);
if (client->monc.monmap == NULL)
- return 0;
+ goto out_unlock;
seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
for (i = 0; i < client->monc.monmap->num_mon; i++) {
@@ -48,6 +49,9 @@ static int monmap_show(struct seq_file *s, void *p)
ENTITY_NAME(inst->name),
ceph_pr_addr(&inst->addr));
}
+
+out_unlock:
+ mutex_unlock(&client->monc.mutex);
return 0;
}
@@ -56,13 +60,14 @@ static int osdmap_show(struct seq_file *s, void *p)
int i;
struct ceph_client *client = s->private;
struct ceph_osd_client *osdc = &client->osdc;
- struct ceph_osdmap *map = osdc->osdmap;
+ struct ceph_osdmap *map;
struct rb_node *n;
+ down_read(&osdc->lock);
+ map = osdc->osdmap;
if (map == NULL)
- return 0;
+ goto out_unlock;
- down_read(&osdc->lock);
seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
osdc->epoch_barrier, map->flags);
@@ -131,6 +136,7 @@ static int osdmap_show(struct seq_file *s, void *p)
seq_printf(s, "]\n");
}
+out_unlock:
up_read(&osdc->lock);
return 0;
}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index 9e39378eda00..9e48623018a3 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -1061,13 +1061,16 @@ static int decrypt_control_remainder(struct ceph_connection *con)
static int process_v2_sparse_read(struct ceph_connection *con,
struct page **pages, int spos)
{
- struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor;
+ struct ceph_msg_data_cursor cursor;
int ret;
+ ceph_msg_data_cursor_init(&cursor, con->in_msg,
+ con->in_msg->sparse_read_total);
+
for (;;) {
char *buf = NULL;
- ret = con->ops->sparse_read(con, cursor, &buf);
+ ret = con->ops->sparse_read(con, &cursor, &buf);
if (ret <= 0)
return ret;
@@ -1085,11 +1088,11 @@ static int process_v2_sparse_read(struct ceph_connection *con,
} else {
struct bio_vec bv;
- get_bvec_at(cursor, &bv);
+ get_bvec_at(&cursor, &bv);
len = min_t(int, len, bv.bv_len);
memcpy_page(bv.bv_page, bv.bv_offset,
spage, soff, len);
- ceph_msg_data_advance(cursor, len);
+ ceph_msg_data_advance(&cursor, len);
}
spos += len;
ret -= len;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 295098873861..d245fa508e1c 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1504,8 +1504,6 @@ static int decode_new_primary_temp(void **p, void *end,
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
{
- BUG_ON(osd >= map->max_osd);
-
if (!map->osd_primary_affinity)
return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1514,8 +1512,6 @@ u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
- BUG_ON(osd >= map->max_osd);
-
if (!map->osd_primary_affinity) {
int i;
@@ -1577,6 +1573,8 @@ static int decode_new_primary_affinity(void **p, void *end,
ceph_decode_32_safe(p, end, osd, e_inval);
ceph_decode_32_safe(p, end, aff, e_inval);
+ if (osd >= map->max_osd)
+ goto e_inval;
ret = set_primary_affinity(map, osd, aff);
if (ret)
@@ -1879,7 +1877,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
osd = ceph_decode_32(p);
w = ceph_decode_32(p);
- BUG_ON(osd >= map->max_osd);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w,
w == CEPH_OSD_IN ? "(in)" :
(w == CEPH_OSD_OUT ? "(out)" : ""));
@@ -1905,13 +1905,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
u32 xorstate;
osd = ceph_decode_32(p);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
if (struct_v >= 5)
xorstate = ceph_decode_32(p);
else
xorstate = ceph_decode_8(p);
if (xorstate == 0)
xorstate = CEPH_OSD_UP;
- BUG_ON(osd >= map->max_osd);
if ((map->osd_state[osd] & CEPH_OSD_UP) &&
(xorstate & CEPH_OSD_UP))
osdmap_info(map, "osd%d down\n", osd);
@@ -1937,7 +1939,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_entity_addr addr;
osd = ceph_decode_32(p);
- BUG_ON(osd >= map->max_osd);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
if (struct_v >= 7)
ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
else
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index ad54b12d4b4c..8bb71a10dba0 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -443,6 +443,9 @@ static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
struct ifreq ifrr;
int err;
+ if (!kernel_cfg->ifr)
+ return -EINVAL;
+
strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b0e0f22d7b21..83cbec4afcb3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -439,7 +439,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = ns_tree_gen_id(&net->ns);
+ net->net_cookie = ns_tree_gen_id(net);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c85f740065fc..331764845e8f 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -811,6 +811,10 @@ static void __netpoll_cleanup(struct netpoll *np)
if (!npinfo)
return;
+ /* At this point, there is a single npinfo instance per netdevice, and
+ * its refcnt tracks how many netpoll structures are linked to it. We
+ * only perform npinfo cleanup when the refcnt decrements to zero.
+ */
if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -820,8 +824,7 @@ static void __netpoll_cleanup(struct netpoll *np)
RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
- } else
- RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ }
skb_pool_flush(np);
}
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 264fb82cba19..d157a8419bca 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -828,13 +828,15 @@ void devl_rate_nodes_destroy(struct devlink *devlink)
if (!devlink_rate->parent)
continue;
- refcount_dec(&devlink_rate->parent->refcnt);
if (devlink_rate_is_leaf(devlink_rate))
ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
else if (devlink_rate_is_node(devlink_rate))
ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
+
+ refcount_dec(&devlink_rate->parent->refcnt);
+ devlink_rate->parent = NULL;
}
list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate)) {
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 82b084cc1cc6..53da62984447 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -78,7 +78,6 @@ int dns_query(struct net *net,
{
struct key *rkey;
struct user_key_payload *upayload;
- const struct cred *saved_cred;
size_t typelen, desclen;
char *desc, *cp;
int ret, len;
@@ -124,9 +123,8 @@ int dns_query(struct net *net,
/* make the upcall, using special credentials to prevent the use of
* add_key() to preinstall malicious redirections
*/
- saved_cred = override_creds(dns_resolver_cache);
- rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
- revert_creds(saved_cred);
+ scoped_with_creds(dns_resolver_cache)
+ rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index d9c77fa553b5..eadb358179ce 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -176,7 +176,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
/* Remove Broadcom tag and update checksum */
skb_pull_rcsum(skb, BRCM_TAG_LEN);
- dsa_default_offload_fwd_mark(skb);
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
return skb;
}
@@ -250,7 +251,8 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
/* Remove Broadcom tag and update checksum */
skb_pull_rcsum(skb, len);
- dsa_default_offload_fwd_mark(skb);
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
dsa_strip_etype_header(skb, len);
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
index 7e46d130dce2..1d33a4675a48 100644
--- a/net/handshake/netlink.c
+++ b/net/handshake/netlink.c
@@ -93,7 +93,7 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
struct handshake_net *hn = handshake_pernet(net);
struct handshake_req *req = NULL;
struct socket *sock;
- int class, fd, err;
+ int class, err;
err = -EOPNOTSUPP;
if (!hn)
@@ -106,27 +106,25 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
err = -EAGAIN;
req = handshake_req_next(hn, class);
- if (!req)
- goto out_status;
-
- sock = req->hr_sk->sk_socket;
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- err = fd;
- goto out_complete;
- }
-
- err = req->hr_proto->hp_accept(req, info, fd);
- if (err) {
- put_unused_fd(fd);
- goto out_complete;
+ if (req) {
+ sock = req->hr_sk->sk_socket;
+
+ FD_PREPARE(fdf, O_CLOEXEC, sock->file);
+ if (fdf.err) {
+ err = fdf.err;
+ goto out_complete;
+ }
+
+ get_file(sock->file); /* FD_PREPARE() consumes a reference. */
+ err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf));
+ if (err)
+ goto out_complete; /* Automatic cleanup handles fput */
+
+ trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf));
+ fd_publish(fdf);
+ return 0;
}
- fd_install(fd, get_file(sock->file));
-
- trace_handshake_cmd_accept(net, req, req->hr_sk, fd);
- return 0;
-
out_complete:
handshake_complete(req, -EIO, NULL);
out_status:
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
index 081093dfd553..8f9532a15f43 100644
--- a/net/handshake/tlshd.c
+++ b/net/handshake/tlshd.c
@@ -259,6 +259,7 @@ static int tls_handshake_accept(struct handshake_req *req,
out_cancel:
genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
out:
return ret;
}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index fbbc3ccf9df6..492cbc78ab75 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -320,6 +320,9 @@ static void send_hsr_supervision_frame(struct hsr_port *port,
}
hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
+ skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ skb_reset_mac_len(skb);
+
set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version);
@@ -334,7 +337,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port,
}
hsr_stag->tlv.HSR_TLV_type = type;
- /* TODO: Why 12 in HSRv0? */
+ /* HSRv0 has 6 unused bytes after the MAC */
hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ?
sizeof(struct hsr_sup_payload) : 12;
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index c67c0d35921d..339f0d220212 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -262,15 +262,23 @@ static struct sk_buff *prp_fill_rct(struct sk_buff *skb,
return skb;
}
-static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr,
+static void hsr_set_path_id(struct hsr_frame_info *frame,
+ struct hsr_ethhdr *hsr_ethhdr,
struct hsr_port *port)
{
int path_id;
- if (port->type == HSR_PT_SLAVE_A)
- path_id = 0;
- else
- path_id = 1;
+ if (port->hsr->prot_version) {
+ if (port->type == HSR_PT_SLAVE_A)
+ path_id = 0;
+ else
+ path_id = 1;
+ } else {
+ if (frame->is_supervision)
+ path_id = 0xf;
+ else
+ path_id = 1;
+ }
set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id);
}
@@ -304,7 +312,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb,
else
hsr_ethhdr = (struct hsr_ethhdr *)pc;
- hsr_set_path_id(hsr_ethhdr, port);
+ hsr_set_path_id(frame, hsr_ethhdr, port);
set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
@@ -330,7 +338,7 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
(struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr);
/* set the lane id properly */
- hsr_set_path_id(hsr_ethhdr, port);
+ hsr_set_path_id(frame, hsr_ethhdr, port);
return skb_clone(frame->skb_hsr, GFP_ATOMIC);
} else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
return skb_clone(frame->skb_std, GFP_ATOMIC);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e0d94270da28..05828d4cb6cd 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
{
- __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6)
- : htons(ETH_P_IP);
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6)
+ : htons(ETH_P_IP);
return skb_eth_gso_segment(skb, features, type);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6d27d3610c1c..b549d6a57307 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -607,6 +607,11 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
oldest_p = fnhe_p;
}
}
+
+ /* Clear oldest->fnhe_daddr to prevent this fnhe from being
+ * rebound with new dsts in rt_bind_exception().
+ */
+ oldest->fnhe_daddr = 0;
fnhe_flush_routes(oldest);
*oldest_p = oldest->fnhe_next;
kfree_rcu(oldest, rcu);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 7b41fb4f00b5..22410243ebe8 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
{
- __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP)
- : htons(ETH_P_IPV6);
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP)
+ : htons(ETH_P_IPV6);
return skb_eth_gso_segment(skb, features, type);
}
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index b4f01cb07561..5dd7e0509a48 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1560,24 +1560,16 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
case SIOCKCMCLONE: {
struct kcm_clone info;
- struct file *file;
- info.fd = get_unused_fd_flags(0);
- if (unlikely(info.fd < 0))
- return info.fd;
+ FD_PREPARE(fdf, 0, kcm_clone(sock));
+ if (fdf.err)
+ return fdf.err;
- file = kcm_clone(sock);
- if (IS_ERR(file)) {
- put_unused_fd(info.fd);
- return PTR_ERR(file);
- }
- if (copy_to_user((void __user *)arg, &info,
- sizeof(info))) {
- put_unused_fd(info.fd);
- fput(file);
+ info.fd = fd_prepare_fd(fdf);
+ if (copy_to_user((void __user *)arg, &info, sizeof(info)))
return -EFAULT;
- }
- fd_install(info.fd, file);
+
+ fd_publish(fdf);
err = 0;
break;
}
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 369a2f2e459c..0710281dd95a 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1246,9 +1246,9 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns
else
l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len));
- /* Reset skb netfilter state */
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED);
+ /* Reset control buffer */
+ memset(skb->cb, 0, sizeof(skb->cb));
+
nf_reset_ct(skb);
/* L2TP uses its own lockdep subclass to avoid lockdep splats caused by
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index a7873832d4fa..0ca55b9655a7 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -223,6 +223,10 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata
if (netif_carrier_ok(sdata->dev))
return -EBUSY;
+ /* if any stations are set known (so they know this vif too), reject */
+ if (sta_info_get_by_idx(sdata, 0))
+ return -EBUSY;
+
/* First check no ROC work is happening on this iface */
list_for_each_entry(roc, &local->roc_list, list) {
if (roc->sdata != sdata)
@@ -242,12 +246,16 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata
ret = -EBUSY;
}
+ /*
+ * More interface types could be added here but changing the
+ * address while powered makes the most sense in client modes.
+ */
switch (sdata->vif.type) {
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_P2P_CLIENT:
- /* More interface types could be added here but changing the
- * address while powered makes the most sense in client modes.
- */
+ /* refuse while connecting */
+ if (sdata->u.mgd.auth_data || sdata->u.mgd.assoc_data)
+ return -EBUSY;
break;
default:
ret = -EOPNOTSUPP;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 6af43dfefdd6..5b4c3fe9970a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -5360,10 +5360,14 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
if (WARN_ON(!local->started))
goto drop;
- if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
+ if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC) &&
+ !(status->flag & RX_FLAG_NO_PSDU &&
+ status->zero_length_psdu_type ==
+ IEEE80211_RADIOTAP_ZERO_LEN_PSDU_NOT_CAPTURED))) {
/*
- * Validate the rate, unless a PLCP error means that
- * we probably can't have a valid rate here anyway.
+ * Validate the rate, unless there was a PLCP error which may
+ * have an invalid rate or the PSDU was not captured and may be
+ * missing rate information.
*/
switch (status->encoding) {
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 4d314e062ba9..2ac4011a953f 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -623,6 +623,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
skb->protocol = htons(ETH_P_MCTP);
skb->pkt_type = PACKET_OUTGOING;
+ skb->dev = dst->dev->dev;
if (skb->len > dst->mtu) {
kfree_skb(skb);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 1103b3341a70..f24ae7d40e88 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -838,8 +838,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
opts->suboptions = 0;
+ /* Force later mptcp_write_options(), but do not use any actual
+ * option space.
+ */
if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb)))
- return false;
+ return true;
if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) {
if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) ||
@@ -1041,6 +1044,31 @@ static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
WRITE_ONCE(msk->snd_una, new_snd_una);
}
+static void rwin_update(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct tcp_sock *tp = tcp_sk(ssk);
+ u64 mptcp_rcv_wnd;
+
+ /* Avoid touching extra cachelines if TCP is going to accept this
+ * skb without filling the TCP-level window even with a possibly
+ * outdated mptcp-level rwin.
+ */
+ if (!skb->len || skb->len < tcp_receive_window(tp))
+ return;
+
+ mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent);
+ if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent))
+ return;
+
+ /* Some other subflow grew the mptcp-level rwin since rcv_wup,
+ * resync.
+ */
+ tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent;
+ subflow->rcv_wnd_sent = mptcp_rcv_wnd;
+}
+
static void ack_update_msk(struct mptcp_sock *msk,
struct sock *ssk,
struct mptcp_options_received *mp_opt)
@@ -1208,6 +1236,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
*/
if (mp_opt.use_ack)
ack_update_msk(msk, sk, &mp_opt);
+ rwin_update(msk, sk, skb);
/* Zero-data-length packets are dropped by the caller and not
* propagated to the MPTCP layer, so the skb extension does not
@@ -1294,6 +1323,10 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
if (rcv_wnd_new != rcv_wnd_old) {
raise_win:
+ /* The msk-level rcv wnd is after the tcp level one,
+ * sync the latter.
+ */
+ rcv_wnd_new = rcv_wnd_old;
win = rcv_wnd_old - ack_seq;
tp->rcv_wnd = min_t(u64, win, U32_MAX);
new_win = tp->rcv_wnd;
@@ -1317,6 +1350,21 @@ raise_win:
update_wspace:
WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
+ subflow->rcv_wnd_sent = rcv_wnd_new;
+}
+
+static void mptcp_track_rwin(struct tcp_sock *tp)
+{
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk;
+
+ if (!ssk)
+ return;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ msk = mptcp_sk(subflow->conn);
+ WRITE_ONCE(msk->old_wspace, tp->rcv_wnd);
}
__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1611,6 +1659,10 @@ mp_rst:
opts->reset_transient,
opts->reset_reason);
return;
+ } else if (unlikely(!opts->suboptions)) {
+ /* Fallback to TCP */
+ mptcp_track_rwin(tp);
+ return;
}
if (OPTION_MPTCP_PRIO & opts->suboptions) {
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 2ff1b9499568..9604b91902b8 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -18,6 +18,7 @@ struct mptcp_pm_add_entry {
u8 retrans_times;
struct timer_list add_timer;
struct mptcp_sock *sock;
+ struct rcu_head rcu;
};
static DEFINE_SPINLOCK(mptcp_pm_list_lock);
@@ -155,7 +156,7 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
entry = mptcp_pm_del_add_timer(msk, addr, false);
ret = entry;
- kfree(entry);
+ kfree_rcu(entry, rcu);
return ret;
}
@@ -345,22 +346,27 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *entry;
struct sock *sk = (struct sock *)msk;
- struct timer_list *add_timer = NULL;
+ bool stop_timer = false;
+
+ rcu_read_lock();
spin_lock_bh(&msk->pm.lock);
entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
if (entry && (!check_id || entry->addr.id == addr->id)) {
entry->retrans_times = ADD_ADDR_RETRANS_MAX;
- add_timer = &entry->add_timer;
+ stop_timer = true;
}
if (!check_id && entry)
list_del(&entry->list);
spin_unlock_bh(&msk->pm.lock);
- /* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */
- if (add_timer)
- sk_stop_timer_sync(sk, add_timer);
+ /* Note: entry might have been removed by another thread.
+ * We hold rcu_read_lock() to ensure it is not freed under us.
+ */
+ if (stop_timer)
+ sk_stop_timer_sync(sk, &entry->add_timer);
+ rcu_read_unlock();
return entry;
}
@@ -415,7 +421,7 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
list_for_each_entry_safe(entry, tmp, &free_list, list) {
sk_stop_timer_sync(sk, &entry->add_timer);
- kfree(entry);
+ kfree_rcu(entry, rcu);
}
}
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index 2ae95476dba3..0a50fd5edc06 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -672,7 +672,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id)
{
- if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
+ if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
u8 limit_add_addr_accepted =
mptcp_pm_get_limit_add_addr_accepted(msk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 2d6b8de35c44..1e413426deee 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -61,11 +61,13 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk)
{
+ unsigned short family = READ_ONCE(sk->sk_family);
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (sk->sk_prot == &tcpv6_prot)
+ if (family == AF_INET6)
return &inet6_stream_ops;
#endif
- WARN_ON_ONCE(sk->sk_prot != &tcp_prot);
+ WARN_ON_ONCE(family != AF_INET);
return &inet_stream_ops;
}
@@ -76,6 +78,13 @@ bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib)
if (__mptcp_check_fallback(msk))
return true;
+ /* The caller possibly is not holding the msk socket lock, but
+ * in the fallback case only the current subflow is touching
+ * the OoO queue.
+ */
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ return false;
+
spin_lock_bh(&msk->fallback_lock);
if (!msk->allow_infinite_fallback) {
spin_unlock_bh(&msk->fallback_lock);
@@ -935,14 +944,19 @@ static void mptcp_reset_rtx_timer(struct sock *sk)
bool mptcp_schedule_work(struct sock *sk)
{
- if (inet_sk_state_load(sk) != TCP_CLOSE &&
- schedule_work(&mptcp_sk(sk)->work)) {
- /* each subflow already holds a reference to the sk, and the
- * workqueue is invoked by a subflow, so sk can't go away here.
- */
- sock_hold(sk);
+ if (inet_sk_state_load(sk) == TCP_CLOSE)
+ return false;
+
+ /* Get a reference on this socket, mptcp_worker() will release it.
+ * As mptcp_worker() might complete before us, we can not avoid
+ * a sock_hold()/sock_put() if schedule_work() returns false.
+ */
+ sock_hold(sk);
+
+ if (schedule_work(&mptcp_sk(sk)->work))
return true;
- }
+
+ sock_put(sk);
return false;
}
@@ -2397,7 +2411,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk)
/* flags for __mptcp_close_ssk() */
#define MPTCP_CF_PUSH BIT(1)
-#define MPTCP_CF_FASTCLOSE BIT(2)
/* be sure to send a reset only if the caller asked for it, also
* clean completely the subflow status when the subflow reaches
@@ -2408,7 +2421,7 @@ static void __mptcp_subflow_disconnect(struct sock *ssk,
unsigned int flags)
{
if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
- (flags & MPTCP_CF_FASTCLOSE)) {
+ subflow->send_fastclose) {
/* The MPTCP code never wait on the subflow sockets, TCP-level
* disconnect should never fail
*/
@@ -2455,14 +2468,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
- if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) {
- /* be sure to force the tcp_close path
- * to generate the egress reset
- */
- ssk->sk_lingertime = 0;
- sock_set_flag(ssk, SOCK_LINGER);
- subflow->send_fastclose = 1;
- }
+ if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE)
+ tcp_set_state(ssk, TCP_CLOSE);
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
@@ -2558,7 +2565,8 @@ static void __mptcp_close_subflow(struct sock *sk)
if (ssk_state != TCP_CLOSE &&
(ssk_state != TCP_CLOSE_WAIT ||
- inet_sk_state_load(sk) != TCP_ESTABLISHED))
+ inet_sk_state_load(sk) != TCP_ESTABLISHED ||
+ __mptcp_check_fallback(msk)))
continue;
/* 'subflow_data_ready' will re-sched once rx queue is empty */
@@ -2657,7 +2665,7 @@ static void __mptcp_retrans(struct sock *sk)
}
if (!mptcp_send_head(sk))
- return;
+ goto clear_scheduled;
goto reset_timer;
}
@@ -2688,7 +2696,7 @@ static void __mptcp_retrans(struct sock *sk)
if (__mptcp_check_fallback(msk)) {
spin_unlock_bh(&msk->fallback_lock);
release_sock(ssk);
- return;
+ goto clear_scheduled;
}
while (info.sent < info.limit) {
@@ -2720,6 +2728,15 @@ reset_timer:
if (!mptcp_rtx_timer_pending(sk))
mptcp_reset_rtx_timer(sk);
+
+clear_scheduled:
+ /* If no rtx data was available or in case of fallback, there
+ * could be left-over scheduled subflows; clear them all
+ * or later xmit could use bad ones
+ */
+ mptcp_for_each_subflow(msk, subflow)
+ if (READ_ONCE(subflow->scheduled))
+ mptcp_subflow_set_scheduled(subflow, false);
}
/* schedule the timeout timer for the relevant event: either close timeout
@@ -2766,9 +2783,32 @@ static void mptcp_do_fastclose(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
mptcp_set_state(sk, TCP_CLOSE);
- mptcp_for_each_subflow_safe(msk, subflow, tmp)
- __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
- subflow, MPTCP_CF_FASTCLOSE);
+
+ /* Explicitly send the fastclose reset as needed */
+ if (__mptcp_check_fallback(msk))
+ return;
+
+ mptcp_for_each_subflow_safe(msk, subflow, tmp) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ lock_sock(ssk);
+
+ /* Some subflow socket states don't allow/need a reset. */
+ if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ goto unlock;
+
+ subflow->send_fastclose = 1;
+
+ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+ * issue in __tcp_select_window(), see tcp_disconnect().
+ */
+ inet_csk(ssk)->icsk_ack.rcv_mss = TCP_MIN_MSS;
+
+ tcp_send_active_reset(ssk, ssk->sk_allocation,
+ SK_RST_REASON_TCP_ABORT_ON_CLOSE);
+unlock:
+ release_sock(ssk);
+ }
}
static void mptcp_worker(struct work_struct *work)
@@ -2795,7 +2835,11 @@ static void mptcp_worker(struct work_struct *work)
__mptcp_close_subflow(sk);
if (mptcp_close_tout_expired(sk)) {
+ struct mptcp_subflow_context *subflow, *tmp;
+
mptcp_do_fastclose(sk);
+ mptcp_for_each_subflow_safe(msk, subflow, tmp)
+ __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0);
mptcp_close_wake_up(sk);
}
@@ -3220,7 +3264,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
/* msk->subflow is still intact, the following will not free the first
* subflow
*/
- mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
+ mptcp_do_fastclose(sk);
+ mptcp_destroy_common(msk);
/* The first subflow is already in TCP_CLOSE status, the following
* can't overlap with a fallback anymore
@@ -3399,7 +3444,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
}
-void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
+void mptcp_destroy_common(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
struct sock *sk = (struct sock *)msk;
@@ -3408,7 +3453,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags)
/* join list will be eventually flushed (with rst) at sock lock release time */
mptcp_for_each_subflow_safe(msk, subflow, tmp)
- __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags);
+ __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0);
__skb_queue_purge(&sk->sk_receive_queue);
skb_rbtree_purge(&msk->out_of_order_queue);
@@ -3426,7 +3471,7 @@ static void mptcp_destroy(struct sock *sk)
/* allow the following to close even the initial subflow */
msk->free_first = 1;
- mptcp_destroy_common(msk, 0);
+ mptcp_destroy_common(msk);
sk_sockets_allocated_dec(sk);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 379a88e14e8d..6ca97096607c 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -509,6 +509,7 @@ struct mptcp_subflow_context {
u64 remote_key;
u64 idsn;
u64 map_seq;
+ u64 rcv_wnd_sent;
u32 snd_isn;
u32 token;
u32 rel_write_seq;
@@ -976,7 +977,7 @@ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
local_bh_enable();
}
-void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags);
+void mptcp_destroy_common(struct mptcp_sock *msk);
#define MPTCP_TOKEN_MAX_RETRIES 4
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index e8325890a322..af707ce0f624 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -2144,6 +2144,10 @@ void __init mptcp_subflow_init(void)
tcp_prot_override = tcp_prot;
tcp_prot_override.release_cb = tcp_release_cb_override;
tcp_prot_override.diag_destroy = tcp_abort_override;
+#ifdef CONFIG_BPF_SYSCALL
+ /* Disable sockmap processing for subflows */
+ tcp_prot_override.psock_update_sk_prot = NULL;
+#endif
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
/* In struct mptcp_subflow_request_sock, we assume the TCP request sock
@@ -2180,6 +2184,10 @@ void __init mptcp_subflow_init(void)
tcpv6_prot_override = tcpv6_prot;
tcpv6_prot_override.release_cb = tcp_release_cb_override;
tcpv6_prot_override.diag_destroy = tcp_abort_override;
+#ifdef CONFIG_BPF_SYSCALL
+ /* Disable sockmap processing for subflows */
+ tcpv6_prot_override.psock_update_sk_prot = NULL;
+#endif
#endif
mptcp_diag_subflow_init(&subflow_ulp_ops);
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 2832e0794197..792ca44a461d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -572,69 +572,6 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}
-static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
- const struct nlattr *a)
-{
- struct nshhdr *nh;
- size_t length;
- int err;
- u8 flags;
- u8 ttl;
- int i;
-
- struct ovs_key_nsh key;
- struct ovs_key_nsh mask;
-
- err = nsh_key_from_nlattr(a, &key, &mask);
- if (err)
- return err;
-
- /* Make sure the NSH base header is there */
- if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN))
- return -ENOMEM;
-
- nh = nsh_hdr(skb);
- length = nsh_hdr_len(nh);
-
- /* Make sure the whole NSH header is there */
- err = skb_ensure_writable(skb, skb_network_offset(skb) +
- length);
- if (unlikely(err))
- return err;
-
- nh = nsh_hdr(skb);
- skb_postpull_rcsum(skb, nh, length);
- flags = nsh_get_flags(nh);
- flags = OVS_MASKED(flags, key.base.flags, mask.base.flags);
- flow_key->nsh.base.flags = flags;
- ttl = nsh_get_ttl(nh);
- ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl);
- flow_key->nsh.base.ttl = ttl;
- nsh_set_flags_and_ttl(nh, flags, ttl);
- nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr,
- mask.base.path_hdr);
- flow_key->nsh.base.path_hdr = nh->path_hdr;
- switch (nh->mdtype) {
- case NSH_M_TYPE1:
- for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) {
- nh->md1.context[i] =
- OVS_MASKED(nh->md1.context[i], key.context[i],
- mask.context[i]);
- }
- memcpy(flow_key->nsh.context, nh->md1.context,
- sizeof(nh->md1.context));
- break;
- case NSH_M_TYPE2:
- memset(flow_key->nsh.context, 0,
- sizeof(flow_key->nsh.context));
- break;
- default:
- return -EINVAL;
- }
- skb_postpush_rcsum(skb, nh, length);
- return 0;
-}
-
/* Must follow skb_ensure_writable() since that can move the skb data. */
static void set_tp_port(struct sk_buff *skb, __be16 *port,
__be16 new_port, __sum16 *check)
@@ -1130,10 +1067,6 @@ static int execute_masked_set_action(struct sk_buff *skb,
get_mask(a, struct ovs_key_ethernet *));
break;
- case OVS_KEY_ATTR_NSH:
- err = set_nsh(skb, flow_key, a);
- break;
-
case OVS_KEY_ATTR_IPV4:
err = set_ipv4(skb, flow_key, nla_data(a),
get_mask(a, struct ovs_key_ipv4 *));
@@ -1170,6 +1103,7 @@ static int execute_masked_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_CT_LABELS:
case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
+ case OVS_KEY_ATTR_NSH:
err = -EINVAL;
break;
}
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ad64bb9ab5e2..1cb4f97335d8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1305,6 +1305,11 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
return 0;
}
+/*
+ * Constructs NSH header 'nh' from attributes of OVS_ACTION_ATTR_PUSH_NSH,
+ * where 'nh' points to a memory block of 'size' bytes. It's assumed that
+ * attributes were previously validated with validate_push_nsh().
+ */
int nsh_hdr_from_nlattr(const struct nlattr *attr,
struct nshhdr *nh, size_t size)
{
@@ -1314,8 +1319,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr,
u8 ttl = 0;
int mdlen = 0;
- /* validate_nsh has check this, so we needn't do duplicate check here
- */
if (size < NSH_BASE_HDR_LEN)
return -ENOBUFS;
@@ -1359,46 +1362,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr,
return 0;
}
-int nsh_key_from_nlattr(const struct nlattr *attr,
- struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask)
-{
- struct nlattr *a;
- int rem;
-
- /* validate_nsh has check this, so we needn't do duplicate check here
- */
- nla_for_each_nested(a, attr, rem) {
- int type = nla_type(a);
-
- switch (type) {
- case OVS_NSH_KEY_ATTR_BASE: {
- const struct ovs_nsh_key_base *base = nla_data(a);
- const struct ovs_nsh_key_base *base_mask = base + 1;
-
- nsh->base = *base;
- nsh_mask->base = *base_mask;
- break;
- }
- case OVS_NSH_KEY_ATTR_MD1: {
- const struct ovs_nsh_key_md1 *md1 = nla_data(a);
- const struct ovs_nsh_key_md1 *md1_mask = md1 + 1;
-
- memcpy(nsh->context, md1->context, sizeof(*md1));
- memcpy(nsh_mask->context, md1_mask->context,
- sizeof(*md1_mask));
- break;
- }
- case OVS_NSH_KEY_ATTR_MD2:
- /* Not supported yet */
- return -ENOTSUPP;
- default:
- return -EINVAL;
- }
- }
-
- return 0;
-}
-
static int nsh_key_put_from_nlattr(const struct nlattr *attr,
struct sw_flow_match *match, bool is_mask,
bool is_push_nsh, bool log)
@@ -2839,17 +2802,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
return err;
}
-static bool validate_nsh(const struct nlattr *attr, bool is_mask,
- bool is_push_nsh, bool log)
+static bool validate_push_nsh(const struct nlattr *attr, bool log)
{
struct sw_flow_match match;
struct sw_flow_key key;
- int ret = 0;
ovs_match_init(&match, &key, true, NULL);
- ret = nsh_key_put_from_nlattr(attr, &match, is_mask,
- is_push_nsh, log);
- return !ret;
+ return !nsh_key_put_from_nlattr(attr, &match, false, true, log);
}
/* Return false if there are any non-masked bits set.
@@ -2997,13 +2956,6 @@ static int validate_set(const struct nlattr *a,
break;
- case OVS_KEY_ATTR_NSH:
- if (eth_type != htons(ETH_P_NSH))
- return -EINVAL;
- if (!validate_nsh(nla_data(a), masked, false, log))
- return -EINVAL;
- break;
-
default:
return -EINVAL;
}
@@ -3437,7 +3389,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
return -EINVAL;
}
mac_proto = MAC_PROTO_NONE;
- if (!validate_nsh(nla_data(a), false, true, true))
+ if (!validate_push_nsh(nla_data(a), log))
return -EINVAL;
break;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index fe7f77fc5f18..ff8cdecbe346 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -65,8 +65,6 @@ int ovs_nla_put_actions(const struct nlattr *attr,
void ovs_nla_free_flow_actions(struct sw_flow_actions *);
void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
-int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh,
- struct ovs_key_nsh *nsh_mask);
int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh,
size_t size);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 396b576390d0..c2b5bc19e091 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -47,12 +47,10 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb,
filter = rcu_dereference(prog->filter);
if (at_ingress) {
__skb_push(skb, skb->mac_len);
- bpf_compute_data_pointers(skb);
- filter_res = bpf_prog_run(filter, skb);
+ filter_res = bpf_prog_run_data_pointers(filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
- bpf_compute_data_pointers(skb);
- filter_res = bpf_prog_run(filter, skb);
+ filter_res = bpf_prog_run_data_pointers(filter, skb);
}
if (unlikely(!skb->tstamp && skb->tstamp_type))
skb->tstamp_type = SKB_CLOCK_REALTIME;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 3e89927d7116..26ba8c2d20ab 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -195,13 +195,15 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
const struct tcf_connmark_info *ci = to_connmark(a);
unsigned char *b = skb_tail_pointer(skb);
const struct tcf_connmark_parms *parms;
- struct tc_connmark opt = {
- .index = ci->tcf_index,
- .refcnt = refcount_read(&ci->tcf_refcnt) - ref,
- .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
- };
+ struct tc_connmark opt;
struct tcf_t t;
+ memset(&opt, 0, sizeof(opt));
+
+ opt.index = ci->tcf_index;
+ opt.refcnt = refcount_read(&ci->tcf_refcnt) - ref;
+ opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind;
+
rcu_read_lock();
parms = rcu_dereference(ci->parms);
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 107c6d83dc5c..7c6975632fc2 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -644,13 +644,15 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ife_info *ife = to_ife(a);
struct tcf_ife_params *p;
- struct tc_ife opt = {
- .index = ife->tcf_index,
- .refcnt = refcount_read(&ife->tcf_refcnt) - ref,
- .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind,
- };
+ struct tc_ife opt;
struct tcf_t t;
+ memset(&opt, 0, sizeof(opt));
+
+ opt.index = ife->tcf_index;
+ opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref;
+ opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind;
+
spin_lock_bh(&ife->tcf_lock);
opt.action = ife->tcf_action;
p = rcu_dereference_protected(ife->params,
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 7fbe42f0e5c2..a32754a2658b 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -97,12 +97,10 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb,
} else if (at_ingress) {
/* It is safe to push/pull even if skb_shared() */
__skb_push(skb, skb->mac_len);
- bpf_compute_data_pointers(skb);
- filter_res = bpf_prog_run(prog->filter, skb);
+ filter_res = bpf_prog_run_data_pointers(prog->filter, skb);
__skb_pull(skb, skb->mac_len);
} else {
- bpf_compute_data_pointers(skb);
- filter_res = bpf_prog_run(prog->filter, skb);
+ filter_res = bpf_prog_run_data_pointers(prog->filter, skb);
}
if (unlikely(!skb->tstamp && skb->tstamp_type))
skb->tstamp_type = SKB_CLOCK_REALTIME;
diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c
index 5337bc462755..2d27f91d8441 100644
--- a/net/sched/em_canid.c
+++ b/net/sched/em_canid.c
@@ -99,6 +99,9 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
int i;
const struct can_filter *lp;
+ if (!pskb_may_pull(skb, CAN_MTU))
+ return 0;
+
can_id = em_canid_get_id(skb);
if (can_id & CAN_EFF_FLAG) {
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index 64b637f18bc7..48c1bce74f49 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -22,9 +22,12 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
- unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
+ unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer);
u32 val = 0;
+ if (!ptr)
+ return 0;
+ ptr += cmp->off;
if (!tcf_valid_offset(skb, ptr, cmp->align))
return 0;
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 4f9f21a05d5e..c65ffa5fff94 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
+ if (!ptr)
+ return 0;
ptr += nbyte->hdr.off;
if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index 6b3d0af72c39..692e2be1793e 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
struct tcf_pkt_info *info)
{
struct text_match *tm = EM_TEXT_PRIV(m);
+ unsigned char *ptr;
int from, to;
- from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+ ptr = tcf_get_base_ptr(skb, tm->from_layer);
+ if (!ptr)
+ return 0;
+ from = ptr - skb->data;
from += tm->from_offset;
- to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+ ptr = tcf_get_base_ptr(skb, tm->to_layer);
+ if (!ptr)
+ return 0;
+ to = ptr - skb->data;
to += tm->to_offset;
return skb_find_text(skb, from, to, tm->config) != UINT_MAX;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 1e058b46d3e1..f56b18c8aebf 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1599,6 +1599,11 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
return -ENOENT;
}
+ if (p->flags & TCQ_F_INGRESS) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot add children to ingress/clsact qdisc");
+ return -EOPNOTSUPP;
+ }
q = qdisc_leaf(p, clid, extack);
if (IS_ERR(q))
return PTR_ERR(q);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1e008a228ebd..7dee9748a56b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -180,9 +180,10 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
static void try_bulk_dequeue_skb(struct Qdisc *q,
struct sk_buff *skb,
const struct netdev_queue *txq,
- int *packets)
+ int *packets, int budget)
{
int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
+ int cnt = 0;
while (bytelimit > 0) {
struct sk_buff *nskb = q->dequeue(q);
@@ -193,8 +194,10 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
bytelimit -= nskb->len; /* covers GSO len */
skb->next = nskb;
skb = nskb;
- (*packets)++; /* GSO counts as one pkt */
+ if (++cnt >= budget)
+ break;
}
+ (*packets) += cnt;
skb_mark_not_on_list(skb);
}
@@ -228,7 +231,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
* A requeued skb (via q->gso_skb) can also be a SKB list.
*/
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
- int *packets)
+ int *packets, int budget)
{
const struct netdev_queue *txq = q->dev_queue;
struct sk_buff *skb = NULL;
@@ -295,7 +298,7 @@ validate:
if (skb) {
bulk:
if (qdisc_may_bulk(q))
- try_bulk_dequeue_skb(q, skb, txq, packets);
+ try_bulk_dequeue_skb(q, skb, txq, packets, budget);
else
try_bulk_dequeue_skb_slow(q, skb, packets);
}
@@ -387,7 +390,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
* >0 - queue is not empty.
*
*/
-static inline bool qdisc_restart(struct Qdisc *q, int *packets)
+static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget)
{
spinlock_t *root_lock = NULL;
struct netdev_queue *txq;
@@ -396,7 +399,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets)
bool validate;
/* Dequeue packet */
- skb = dequeue_skb(q, &validate, packets);
+ skb = dequeue_skb(q, &validate, packets, budget);
if (unlikely(!skb))
return false;
@@ -414,7 +417,7 @@ void __qdisc_run(struct Qdisc *q)
int quota = READ_ONCE(net_hotdata.dev_tx_weight);
int packets;
- while (qdisc_restart(q, &packets)) {
+ while (qdisc_restart(q, &packets, quota)) {
quota -= packets;
if (quota <= 0) {
if (q->flags & TCQ_F_NOLOCK)
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0d48c61fe6ad..0c56d9673cc1 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -486,6 +486,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
if (tp->rttvar || tp->srtt) {
struct net *net = tp->asoc->base.net;
+ unsigned int rto_beta, rto_alpha;
/* 6.3.1 C3) When a new RTT measurement R' is made, set
* RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'|
* SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R'
@@ -497,10 +498,14 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
* For example, assuming the default value of RTO.Alpha of
* 1/8, rto_alpha would be expressed as 3.
*/
- tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta)
- + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
- tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha)
- + (rtt >> net->sctp.rto_alpha);
+ rto_beta = READ_ONCE(net->sctp.rto_beta);
+ if (rto_beta < 32)
+ tp->rttvar = tp->rttvar - (tp->rttvar >> rto_beta)
+ + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> rto_beta);
+ rto_alpha = READ_ONCE(net->sctp.rto_alpha);
+ if (rto_alpha < 32)
+ tp->srtt = tp->srtt - (tp->srtt >> rto_alpha)
+ + (rtt >> rto_alpha);
} else {
/* 6.3.1 C2) When the first RTT measurement R is made, set
* SRTT <- R, RTTVAR <- R/2.
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 157aace169d4..87c87edadde7 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -890,6 +890,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
return SMC_CLC_DECL_CNFERR;
}
pclc_base->hdr.typev1 = SMC_TYPE_N;
+ ini->smc_type_v1 = SMC_TYPE_N;
} else {
pclc_base->iparea_offset = htons(sizeof(*pclc_smcd));
plen += sizeof(*pclc_prfx) +
diff --git a/net/socket.c b/net/socket.c
index e8892b218708..e1bf93508f05 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -503,21 +503,12 @@ EXPORT_SYMBOL(sock_alloc_file);
static int sock_map_fd(struct socket *sock, int flags)
{
- struct file *newfile;
- int fd = get_unused_fd_flags(flags);
- if (unlikely(fd < 0)) {
- sock_release(sock);
- return fd;
- }
-
- newfile = sock_alloc_file(sock, flags, NULL);
- if (!IS_ERR(newfile)) {
- fd_install(fd, newfile);
- return fd;
- }
+ int fd;
- put_unused_fd(fd);
- return PTR_ERR(newfile);
+ fd = FD_ADD(flags, sock_alloc_file(sock, flags, NULL));
+ if (fd < 0)
+ sock_release(sock);
+ return fd;
}
/**
@@ -2012,8 +2003,6 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s
int __user *upeer_addrlen, int flags)
{
struct proto_accept_arg arg = { };
- struct file *newfile;
- int newfd;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -2021,18 +2010,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
- newfd = get_unused_fd_flags(flags);
- if (unlikely(newfd < 0))
- return newfd;
-
- newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen,
- flags);
- if (IS_ERR(newfile)) {
- put_unused_fd(newfd);
- return PTR_ERR(newfile);
- }
- fd_install(newfd, newfile);
- return newfd;
+ return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags));
}
/*
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 43b1f558b33d..e659fea2da70 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -238,7 +238,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
strp_parser_err(strp, -EMSGSIZE, desc);
break;
} else if (len <= (ssize_t)head->len -
- skb->len - stm->strp.offset) {
+ (ssize_t)skb->len - stm->strp.offset) {
/* Length must be into new skb (and also
* greater than zero)
*/
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 984e0cf9bf8a..a570e7adf270 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -18,10 +18,9 @@ config SUNRPC_SWAP
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism"
- depends on SUNRPC
+ depends on SUNRPC && CRYPTO
default y
select SUNRPC_GSS
- select CRYPTO
select CRYPTO_SKCIPHER
select CRYPTO_HASH
help
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 0e95572e56b4..7e65d0b0c4a8 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -145,7 +145,9 @@ void tipc_net_finalize_work(struct work_struct *work)
{
struct tipc_net *tn = container_of(work, struct tipc_net, work);
+ rtnl_lock();
tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr);
+ rtnl_unlock();
}
void tipc_net_stop(struct net *net)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 768098dec231..45a606c013fc 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1210,25 +1210,16 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
unix_mkname_bsd(sunaddr, addr_len);
if (flags & SOCK_COREDUMP) {
- const struct cred *cred;
- struct cred *kcred;
struct path root;
- kcred = prepare_kernel_cred(&init_task);
- if (!kcred) {
- err = -ENOMEM;
- goto fail;
- }
-
task_lock(&init_task);
get_fs_root(init_task.fs, &root);
task_unlock(&init_task);
- cred = override_creds(kcred);
- err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
- LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
- LOOKUP_NO_MAGICLINKS, &path);
- put_cred(revert_creds(cred));
+ scoped_with_kernel_creds()
+ err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
+ LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
+ LOOKUP_NO_MAGICLINKS, &path);
path_put(&root);
if (err)
goto fail;
@@ -1399,7 +1390,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
idmap = mnt_idmap(parent.mnt);
err = security_path_mknod(&parent, dentry, mode, 0);
if (!err)
- err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
+ err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL);
if (err)
goto out_path;
err = mutex_lock_interruptible(&u->bindlock);
@@ -2954,6 +2945,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
u = unix_sk(sk);
+redo:
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
@@ -2965,7 +2957,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
struct sk_buff *skb, *last;
int chunk;
-redo:
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD)) {
err = -ECONNRESET;
@@ -3015,7 +3006,6 @@ again:
goto out;
}
- mutex_lock(&u->iolock);
goto redo;
unlock:
unix_state_unlock(sk);
@@ -3286,9 +3276,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
- struct file *f;
- int fd;
-
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -3298,18 +3285,7 @@ static int unix_open_file(struct sock *sk)
if (!unix_sk(sk)->path.dentry)
return -ENOENT;
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred());
- if (IS_ERR(f)) {
- put_unused_fd(fd);
- return PTR_ERR(f);
- }
-
- fd_install(fd, f);
- return fd;
+ return FD_ADD(O_CLOEXEC, dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()));
}
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 684ab03137b6..65396a4e1b07 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -145,6 +145,7 @@ enum unix_vertex_index {
};
static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
+static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START;
static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
@@ -153,6 +154,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
if (!vertex) {
vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
vertex->index = unix_vertex_unvisited_index;
+ vertex->scc_index = ++unix_vertex_max_scc_index;
vertex->out_degree = 0;
INIT_LIST_HEAD(&vertex->edges);
INIT_LIST_HEAD(&vertex->scc_entry);
@@ -489,10 +491,15 @@ prev_vertex:
scc_dead = unix_vertex_dead(v);
}
- if (scc_dead)
+ if (scc_dead) {
unix_collect_skb(&scc, hitlist);
- else if (!unix_graph_maybe_cyclic)
- unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+ } else {
+ if (unix_vertex_max_scc_index < vertex->scc_index)
+ unix_vertex_max_scc_index = vertex->scc_index;
+
+ if (!unix_graph_maybe_cyclic)
+ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+ }
list_del(&scc);
}
@@ -507,6 +514,7 @@ static void unix_walk_scc(struct sk_buff_head *hitlist)
unsigned long last_index = UNIX_VERTEX_INDEX_START;
unix_graph_maybe_cyclic = false;
+ unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START;
/* Visit every vertex exactly once.
* __unix_walk_scc() moves visited vertices to unix_visited_vertices.
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 76763247a377..a9ca9c3b87b3 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1661,18 +1661,40 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
timeout = schedule_timeout(timeout);
lock_sock(sk);
- if (signal_pending(current)) {
- err = sock_intr_errno(timeout);
- sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE;
- sock->state = SS_UNCONNECTED;
- vsock_transport_cancel_pkt(vsk);
- vsock_remove_connected(vsk);
- goto out_wait;
- } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) {
- err = -ETIMEDOUT;
+ /* Connection established. Whatever happens to socket once we
+ * release it, that's not connect()'s concern. No need to go
+ * into signal and timeout handling. Call it a day.
+ *
+ * Note that allowing to "reset" an already established socket
+ * here is racy and insecure.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED)
+ break;
+
+ /* If connection was _not_ established and a signal/timeout came
+ * to be, we want the socket's state reset. User space may want
+ * to retry.
+ *
+ * sk_state != TCP_ESTABLISHED implies that socket is not on
+ * vsock_connected_table. We keep the binding and the transport
+ * assigned.
+ */
+ if (signal_pending(current) || timeout == 0) {
+ err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout);
+
+ /* Listener might have already responded with
+ * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our
+ * sk_state == TCP_SYN_SENT, which hereby we break.
+ * In such case VIRTIO_VSOCK_OP_RST will follow.
+ */
sk->sk_state = TCP_CLOSE;
sock->state = SS_UNCONNECTED;
+
+ /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by
+ * transport->connect().
+ */
vsock_transport_cancel_pkt(vsk);
+
goto out_wait;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 7b0c68a70888..69bbcca8ac75 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,20 +36,13 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET 32
-struct xsk_addr_node {
- u64 addr;
- struct list_head addr_node;
-};
-
-struct xsk_addr_head {
+struct xsk_addrs {
u32 num_descs;
- struct list_head addrs_list;
+ u64 addrs[MAX_SKB_FRAGS + 1];
};
static struct kmem_cache *xsk_tx_generic_cache;
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -558,29 +551,68 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
return ret;
}
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+{
+ return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; /* bit 0 set: arg holds an address tagged by xsk_skb_destructor_set_addr(), not a pointer */
+}
+
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+ return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); /* strip the bit-0 tag; NOTE(review): assumes the stored addr had bit 0 clear - confirm */
+}
+
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); /* keep addr inline in destructor_arg, with bit 0 set as the "is an address" tag */
+}
+
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb)) { /* untagged: destructor_arg is treated as a struct xsk_addrs pointer */
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ xsk_addr->num_descs++;
+ } /* tagged inline address: no counter exists, so nothing is incremented */
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (xsk_skb_destructor_is_addr(skb))
+ return 1; /* a tagged inline address is reported as exactly one descriptor */
+
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ return xsk_addr->num_descs; /* otherwise the count stored in struct xsk_addrs */
+}
+
static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
struct sk_buff *skb)
{
- struct xsk_addr_node *pos, *tmp;
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addrs *xsk_addr;
u32 descs_processed = 0;
unsigned long flags;
- u32 idx;
+ u32 idx, i;
spin_lock_irqsave(&pool->cq_lock, flags);
idx = xskq_get_prod(pool->cq);
- xskq_prod_write_addr(pool->cq, idx,
- (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
- descs_processed++;
+ if (unlikely(num_descs > 1)) {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- if (unlikely(XSKCB(skb)->num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ for (i = 0; i < num_descs; i++) {
xskq_prod_write_addr(pool->cq, idx + descs_processed,
- pos->addr);
+ xsk_addr->addrs[i]);
descs_processed++;
- list_del(&pos->addr_node);
- kmem_cache_free(xsk_tx_generic_cache, pos);
}
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+ } else {
+ xskq_prod_write_addr(pool->cq, idx,
+ xsk_skb_destructor_get_addr(skb));
+ descs_processed++;
}
xskq_prod_submit_n(pool->cq, descs_processed);
spin_unlock_irqrestore(&pool->cq_lock, flags);
@@ -595,16 +627,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
-static void xsk_inc_num_desc(struct sk_buff *skb)
-{
- XSKCB(skb)->num_descs++;
-}
-
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
- return XSKCB(skb)->num_descs;
-}
-
static void xsk_destruct_skb(struct sk_buff *skb)
{
struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -621,27 +643,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
u64 addr)
{
- BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
- INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
- XSKCB(skb)->num_descs = 0;
skb->destructor = xsk_destruct_skb;
- skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+ xsk_skb_destructor_set_addr(skb, addr);
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
u32 num_descs = xsk_get_num_desc(skb);
- struct xsk_addr_node *pos, *tmp;
+ struct xsk_addrs *xsk_addr;
if (unlikely(num_descs > 1)) {
- list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
- list_del(&pos->addr_node);
- kmem_cache_free(xsk_tx_generic_cache, pos);
- }
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
}
skb->destructor = sock_wfree;
@@ -701,7 +718,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
- struct xsk_addr_node *xsk_addr;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -727,16 +743,26 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
}
} else {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
+ struct xsk_addrs *xsk_addr;
+
+ if (xsk_skb_destructor_is_addr(skb)) {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+ GFP_KERNEL);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ xsk_addr->num_descs = 1;
+ xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ } else {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ }
/* in case of -EOVERFLOW that could happen below,
* xsk_consume_skb() will release this node as whole skb
* would be dropped, which implies freeing all list elements
*/
- xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
len = desc->len;
@@ -813,10 +839,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
- struct xsk_addr_node *xsk_addr;
+ struct xsk_addrs *xsk_addr;
struct page *page;
u8 *vaddr;
+ if (xsk_skb_destructor_is_addr(skb)) {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+ GFP_KERNEL);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
+ }
+
+ xsk_addr->num_descs = 1;
+ xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ } else {
+ xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ }
+
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
err = -EOVERFLOW;
goto free_err;
@@ -828,13 +869,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
- if (!xsk_addr) {
- __free_page(page);
- err = -ENOMEM;
- goto free_err;
- }
-
vaddr = kmap_local_page(page);
memcpy(vaddr, buffer, len);
kunmap_local(vaddr);
@@ -842,8 +876,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
- xsk_addr->addr = desc->addr;
- list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+ xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
}
}
@@ -1904,7 +1937,7 @@ static int __init xsk_init(void)
goto out_pernet;
xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
- sizeof(struct xsk_addr_node),
+ sizeof(struct xsk_addrs),
0, SLAB_HWCACHE_ALIGN, NULL);
if (!xsk_tx_generic_cache) {
err = -ENOMEM;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 44b9de6e4e77..52ae0e034d29 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -438,7 +438,7 @@ ok:
check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
x->props.mode == XFRM_MODE_TUNNEL;
- switch (x->inner_mode.family) {
+ switch (skb_dst(skb)->ops->family) {
case AF_INET:
/* Check for IPv4 options */
if (ip_hdr(skb)->ihl != 5)
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 9077730ff7d0..54222fcbd7fd 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -698,7 +698,7 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x)
return;
if (x->outer_mode.encap == XFRM_MODE_TUNNEL) {
- switch (x->outer_mode.family) {
+ switch (skb_dst(skb)->ops->family) {
case AF_INET:
xo->inner_ipproto = ip_hdr(skb)->protocol;
break;
@@ -772,8 +772,12 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
/* Exclusive direct xmit for tunnel mode, as
* some filtering or matching rules may apply
* in transport mode.
+ * Locally generated packets also require
+ * the normal XFRM path for L2 header setup,
+ * as the hardware needs the L2 header to match
+ * for encryption, so skip direct output as well.
*/
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL && !skb->sk)
return xfrm_dev_direct_output(sk, x, skb);
return xfrm_output_resume(sk, skb, 0);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d213ca3653a8..9e14e453b55c 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -592,6 +592,7 @@ void xfrm_state_free(struct xfrm_state *x)
}
EXPORT_SYMBOL(xfrm_state_free);
+static void xfrm_state_delete_tunnel(struct xfrm_state *x);
static void xfrm_state_gc_destroy(struct xfrm_state *x)
{
if (x->mode_cbs && x->mode_cbs->destroy_state)
@@ -607,6 +608,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
kfree(x->replay_esn);
kfree(x->preplay_esn);
xfrm_unset_type_offload(x);
+ xfrm_state_delete_tunnel(x);
if (x->type) {
x->type->destructor(x);
xfrm_put_type(x->type);
@@ -806,7 +808,6 @@ void __xfrm_state_destroy(struct xfrm_state *x)
}
EXPORT_SYMBOL(__xfrm_state_destroy);
-static void xfrm_state_delete_tunnel(struct xfrm_state *x);
int __xfrm_state_delete(struct xfrm_state *x)
{
struct net *net = xs_net(x);
@@ -2073,6 +2074,7 @@ static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig,
return x;
error:
+ x->km.state = XFRM_STATE_DEAD;
xfrm_state_put(x);
out:
return NULL;
@@ -2157,11 +2159,15 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
xfrm_state_insert(xc);
} else {
if (xfrm_state_add(xc) < 0)
- goto error;
+ goto error_add;
}
return xc;
+error_add:
+ if (xuo)
+ xfrm_dev_state_delete(xc);
error:
+ xc->km.state = XFRM_STATE_DEAD;
xfrm_state_put(xc);
return NULL;
}
@@ -2191,14 +2197,18 @@ int xfrm_state_update(struct xfrm_state *x)
}
if (x1->km.state == XFRM_STATE_ACQ) {
- if (x->dir && x1->dir != x->dir)
+ if (x->dir && x1->dir != x->dir) {
+ to_put = x1;
goto out;
+ }
__xfrm_state_insert(x);
x = NULL;
} else {
- if (x1->dir != x->dir)
+ if (x1->dir != x->dir) {
+ to_put = x1;
goto out;
+ }
}
err = 0;
@@ -3298,6 +3308,7 @@ out_bydst:
void xfrm_state_fini(struct net *net)
{
unsigned int sz;
+ int i;
flush_work(&net->xfrm.state_hash_work);
xfrm_state_flush(net, 0, false);
@@ -3305,14 +3316,17 @@ void xfrm_state_fini(struct net *net)
WARN_ON(!list_empty(&net->xfrm.state_all));
+ for (i = 0; i <= net->xfrm.state_hmask; i++) {
+ WARN_ON(!hlist_empty(net->xfrm.state_byseq + i));
+ WARN_ON(!hlist_empty(net->xfrm.state_byspi + i));
+ WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i));
+ WARN_ON(!hlist_empty(net->xfrm.state_bydst + i));
+ }
+
sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head);
- WARN_ON(!hlist_empty(net->xfrm.state_byseq));
xfrm_hash_free(net->xfrm.state_byseq, sz);
- WARN_ON(!hlist_empty(net->xfrm.state_byspi));
xfrm_hash_free(net->xfrm.state_byspi, sz);
- WARN_ON(!hlist_empty(net->xfrm.state_bysrc));
xfrm_hash_free(net->xfrm.state_bysrc, sz);
- WARN_ON(!hlist_empty(net->xfrm.state_bydst));
xfrm_hash_free(net->xfrm.state_bydst, sz);
free_percpu(net->xfrm.state_cache_input);
}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 010c9e6638c0..403b5ecac2c5 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -947,8 +947,11 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
if (attrs[XFRMA_SA_PCPU]) {
x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]);
- if (x->pcpu_num >= num_possible_cpus())
+ if (x->pcpu_num >= num_possible_cpus()) {
+ err = -ERANGE;
+ NL_SET_ERR_MSG(extack, "pCPU number too big");
goto error;
+ }
}
err = __xfrm_init_state(x, extack);
@@ -3035,6 +3038,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
}
xfrm_state_free(x);
+ xfrm_dev_policy_delete(xp);
+ xfrm_dev_policy_free(xp);
+ security_xfrm_policy_free(xp->security);
kfree(xp);
return 0;
diff --git a/rust/Makefile b/rust/Makefile
index 3e545c1a0ff4..7842ad0a4ea7 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -298,7 +298,7 @@ bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \
-fno-inline-functions-called-once -fsanitize=bounds-strict \
-fstrict-flex-arrays=% -fmin-function-alignment=% \
-fzero-init-padding-bits=% -mno-fdpic \
- --param=% --param asan-%
+ --param=% --param asan-% -fno-isolate-erroneous-paths-dereference
# Derived from `scripts/Makefile.clang`.
BINDGEN_TARGET_x86 := x86_64-linux-gnu
diff --git a/rust/kernel/debugfs/traits.rs b/rust/kernel/debugfs/traits.rs
index ab009eb254b3..92054fed2136 100644
--- a/rust/kernel/debugfs/traits.rs
+++ b/rust/kernel/debugfs/traits.rs
@@ -4,14 +4,11 @@
//! Traits for rendering or updating values exported to DebugFS.
use crate::prelude::*;
+use crate::sync::atomic::{Atomic, AtomicBasicOps, AtomicType, Relaxed};
use crate::sync::Mutex;
use crate::uaccess::UserSliceReader;
use core::fmt::{self, Debug, Formatter};
use core::str::FromStr;
-use core::sync::atomic::{
- AtomicI16, AtomicI32, AtomicI64, AtomicI8, AtomicIsize, AtomicU16, AtomicU32, AtomicU64,
- AtomicU8, AtomicUsize, Ordering,
-};
/// A trait for types that can be written into a string.
///
@@ -50,7 +47,7 @@ pub trait Reader {
fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result;
}
-impl<T: FromStr> Reader for Mutex<T> {
+impl<T: FromStr + Unpin> Reader for Mutex<T> {
fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result {
let mut buf = [0u8; 128];
if reader.len() > buf.len() {
@@ -66,37 +63,21 @@ impl<T: FromStr> Reader for Mutex<T> {
}
}
-macro_rules! impl_reader_for_atomic {
- ($(($atomic_type:ty, $int_type:ty)),*) => {
- $(
- impl Reader for $atomic_type {
- fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result {
- let mut buf = [0u8; 21]; // Enough for a 64-bit number.
- if reader.len() > buf.len() {
- return Err(EINVAL);
- }
- let n = reader.len();
- reader.read_slice(&mut buf[..n])?;
+impl<T: AtomicType + FromStr> Reader for Atomic<T>
+where
+ T::Repr: AtomicBasicOps,
+{
+ fn read_from_slice(&self, reader: &mut UserSliceReader) -> Result {
+ let mut buf = [0u8; 21]; // Enough for a 64-bit number.
+ if reader.len() > buf.len() {
+ return Err(EINVAL);
+ }
+ let n = reader.len();
+ reader.read_slice(&mut buf[..n])?;
- let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?;
- let val = s.trim().parse::<$int_type>().map_err(|_| EINVAL)?;
- self.store(val, Ordering::Relaxed);
- Ok(())
- }
- }
- )*
- };
+ let s = core::str::from_utf8(&buf[..n]).map_err(|_| EINVAL)?;
+ let val = s.trim().parse::<T>().map_err(|_| EINVAL)?;
+ self.store(val, Relaxed);
+ Ok(())
+ }
}
-
-impl_reader_for_atomic!(
- (AtomicI16, i16),
- (AtomicI32, i32),
- (AtomicI64, i64),
- (AtomicI8, i8),
- (AtomicIsize, isize),
- (AtomicU16, u16),
- (AtomicU32, u32),
- (AtomicU64, u64),
- (AtomicU8, u8),
- (AtomicUsize, usize)
-);
diff --git a/rust/kernel/sync/atomic.rs b/rust/kernel/sync/atomic.rs
index 016a6bcaf080..3afc376be42d 100644
--- a/rust/kernel/sync/atomic.rs
+++ b/rust/kernel/sync/atomic.rs
@@ -22,9 +22,10 @@ mod predefine;
pub use internal::AtomicImpl;
pub use ordering::{Acquire, Full, Relaxed, Release};
+pub(crate) use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps};
use crate::build_error;
-use internal::{AtomicArithmeticOps, AtomicBasicOps, AtomicExchangeOps, AtomicRepr};
+use internal::AtomicRepr;
use ordering::OrderingType;
/// A memory location which can be safely modified from multiple execution contexts.
@@ -306,6 +307,15 @@ where
}
}
+impl<T: AtomicType + core::fmt::Debug> core::fmt::Debug for Atomic<T>
+where
+ T::Repr: AtomicBasicOps,
+{
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ core::fmt::Debug::fmt(&self.load(Relaxed), f)
+ }
+}
+
impl<T: AtomicType> Atomic<T>
where
T::Repr: AtomicExchangeOps,
diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs
index 27202beef90c..cb00fdb94ffd 100644
--- a/rust/kernel/sync/lock.rs
+++ b/rust/kernel/sync/lock.rs
@@ -11,7 +11,7 @@ use crate::{
types::{NotThreadSafe, Opaque, ScopeGuard},
};
use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin};
-use pin_init::{pin_data, pin_init, PinInit};
+use pin_init::{pin_data, pin_init, PinInit, Wrapper};
pub mod mutex;
pub mod spinlock;
@@ -115,6 +115,7 @@ pub struct Lock<T: ?Sized, B: Backend> {
_pin: PhantomPinned,
/// The data protected by the lock.
+ #[pin]
pub(crate) data: UnsafeCell<T>,
}
@@ -127,9 +128,13 @@ unsafe impl<T: ?Sized + Send, B: Backend> Sync for Lock<T, B> {}
impl<T, B: Backend> Lock<T, B> {
/// Constructs a new lock initialiser.
- pub fn new(t: T, name: &'static CStr, key: Pin<&'static LockClassKey>) -> impl PinInit<Self> {
+ pub fn new(
+ t: impl PinInit<T>,
+ name: &'static CStr,
+ key: Pin<&'static LockClassKey>,
+ ) -> impl PinInit<Self> {
pin_init!(Self {
- data: UnsafeCell::new(t),
+ data <- UnsafeCell::pin_init(t),
_pin: PhantomPinned,
// SAFETY: `slot` is valid while the closure is called and both `name` and `key` have
// static lifetimes so they live indefinitely.
@@ -240,6 +245,31 @@ impl<'a, T: ?Sized, B: Backend> Guard<'a, T, B> {
cb()
}
+
+ /// Returns a pinned mutable reference to the protected data.
+ ///
+ /// The guard implements [`DerefMut`] when `T: Unpin`, so for [`Unpin`]
+ /// types [`DerefMut`] should be used instead of this function.
+ ///
+ /// [`DerefMut`]: core::ops::DerefMut
+ /// [`Unpin`]: core::marker::Unpin
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// # use kernel::sync::{Mutex, MutexGuard};
+ /// # use core::{pin::Pin, marker::PhantomPinned};
+ /// struct Data(PhantomPinned);
+ ///
+ /// fn example(mutex: &Mutex<Data>) {
+ /// let mut data: MutexGuard<'_, Data> = mutex.lock();
+ /// let mut data: Pin<&mut Data> = data.as_mut();
+ /// }
+ /// ```
+ pub fn as_mut(&mut self) -> Pin<&mut T> {
+ // SAFETY: `self.lock.data` is structurally pinned.
+ unsafe { Pin::new_unchecked(&mut *self.lock.data.get()) }
+ }
}
impl<T: ?Sized, B: Backend> core::ops::Deref for Guard<'_, T, B> {
@@ -251,7 +281,10 @@ impl<T: ?Sized, B: Backend> core::ops::Deref for Guard<'_, T, B> {
}
}
-impl<T: ?Sized, B: Backend> core::ops::DerefMut for Guard<'_, T, B> {
+impl<T: ?Sized, B: Backend> core::ops::DerefMut for Guard<'_, T, B>
+where
+ T: Unpin,
+{
fn deref_mut(&mut self) -> &mut Self::Target {
// SAFETY: The caller owns the lock, so it is safe to deref the protected data.
unsafe { &mut *self.lock.data.get() }
diff --git a/rust/kernel/sync/lock/global.rs b/rust/kernel/sync/lock/global.rs
index d65f94b5caf2..38b448032799 100644
--- a/rust/kernel/sync/lock/global.rs
+++ b/rust/kernel/sync/lock/global.rs
@@ -106,7 +106,10 @@ impl<B: GlobalLockBackend> core::ops::Deref for GlobalGuard<B> {
}
}
-impl<B: GlobalLockBackend> core::ops::DerefMut for GlobalGuard<B> {
+impl<B: GlobalLockBackend> core::ops::DerefMut for GlobalGuard<B>
+where
+ B::Item: Unpin,
+{
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.inner
}
diff --git a/samples/rust/rust_debugfs.rs b/samples/rust/rust_debugfs.rs
index 82b61a15a34b..711faa07bece 100644
--- a/samples/rust/rust_debugfs.rs
+++ b/samples/rust/rust_debugfs.rs
@@ -32,14 +32,12 @@
//! ```
use core::str::FromStr;
-use core::sync::atomic::AtomicUsize;
-use core::sync::atomic::Ordering;
use kernel::c_str;
use kernel::debugfs::{Dir, File};
use kernel::new_mutex;
use kernel::prelude::*;
+use kernel::sync::atomic::{Atomic, Relaxed};
use kernel::sync::Mutex;
-
use kernel::{acpi, device::Core, of, platform, str::CString, types::ARef};
kernel::module_platform_driver! {
@@ -59,7 +57,7 @@ struct RustDebugFs {
#[pin]
_compatible: File<CString>,
#[pin]
- counter: File<AtomicUsize>,
+ counter: File<Atomic<usize>>,
#[pin]
inner: File<Mutex<Inner>>,
}
@@ -109,7 +107,7 @@ impl platform::Driver for RustDebugFs {
) -> Result<Pin<KBox<Self>>> {
let result = KBox::try_pin_init(RustDebugFs::new(pdev), GFP_KERNEL)?;
// We can still mutate fields through the files which are atomic or mutexed:
- result.counter.store(91, Ordering::Relaxed);
+ result.counter.store(91, Relaxed);
{
let mut guard = result.inner.lock();
guard.x = guard.y;
@@ -120,8 +118,8 @@ impl platform::Driver for RustDebugFs {
}
impl RustDebugFs {
- fn build_counter(dir: &Dir) -> impl PinInit<File<AtomicUsize>> + '_ {
- dir.read_write_file(c_str!("counter"), AtomicUsize::new(0))
+ fn build_counter(dir: &Dir) -> impl PinInit<File<Atomic<usize>>> + '_ {
+ dir.read_write_file(c_str!("counter"), Atomic::<usize>::new(0))
}
fn build_inner(dir: &Dir) -> impl PinInit<File<Mutex<Inner>>> + '_ {
diff --git a/samples/rust/rust_debugfs_scoped.rs b/samples/rust/rust_debugfs_scoped.rs
index b0c4e76b123e..9f0ec5f24cda 100644
--- a/samples/rust/rust_debugfs_scoped.rs
+++ b/samples/rust/rust_debugfs_scoped.rs
@@ -6,9 +6,9 @@
//! `Scope::dir` to create a variety of files without the need to separately
//! track them all.
-use core::sync::atomic::AtomicUsize;
use kernel::debugfs::{Dir, Scope};
use kernel::prelude::*;
+use kernel::sync::atomic::Atomic;
use kernel::sync::Mutex;
use kernel::{c_str, new_mutex, str::CString};
@@ -62,7 +62,7 @@ fn create_file_write(
let file_name = CString::try_from_fmt(fmt!("{name_str}"))?;
for sub in items {
nums.push(
- AtomicUsize::new(sub.parse().map_err(|_| EINVAL)?),
+ Atomic::<usize>::new(sub.parse().map_err(|_| EINVAL)?),
GFP_KERNEL,
)?;
}
@@ -109,7 +109,7 @@ impl ModuleData {
struct DeviceData {
name: CString,
- nums: KVec<AtomicUsize>,
+ nums: KVec<Atomic<usize>>,
}
fn init_control(base_dir: &Dir, dyn_dirs: Dir) -> impl PinInit<Scope<ModuleData>> + '_ {
diff --git a/samples/vfs/test-statx.c b/samples/vfs/test-statx.c
index 49c7a46cee07..424a6fa15723 100644
--- a/samples/vfs/test-statx.c
+++ b/samples/vfs/test-statx.c
@@ -19,6 +19,12 @@
#include <time.h>
#include <sys/syscall.h>
#include <sys/types.h>
+
+// Work around glibc header silliness
+#undef AT_RENAME_NOREPLACE
+#undef AT_RENAME_EXCHANGE
+#undef AT_RENAME_WHITEOUT
+
#include <linux/stat.h>
#include <linux/fcntl.h>
#define statx foo
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
index 8c6cb57d5cfc..24cf7d7a1972 100644
--- a/samples/watch_queue/watch_test.c
+++ b/samples/watch_queue/watch_test.c
@@ -16,6 +16,12 @@
#include <errno.h>
#include <sys/ioctl.h>
#include <limits.h>
+
+// Work around glibc header silliness
+#undef AT_RENAME_NOREPLACE
+#undef AT_RENAME_EXCHANGE
+#undef AT_RENAME_WHITEOUT
+
#include <linux/watch_queue.h>
#include <linux/unistd.h>
#include <linux/keyctl.h>
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index 6af392f9cd02..68e6fafcb80c 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -28,8 +28,10 @@ endif
KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds
ifdef CONFIG_CC_IS_CLANG
-# The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable.
+# The kernel builds with '-std=gnu11' and '-fms-extensions' so use of GNU and
+# Microsoft extensions is acceptable.
KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-microsoft-anon-tag
# Clang checks for overflow/truncation with '%p', while GCC does not:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 592f3ec89b5f..9c1d53f81eb2 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -12,7 +12,7 @@ gen_param_check()
local arg="$1"; shift
local type="${arg%%:*}"
local name="$(gen_param_name "${arg}")"
- local rw="write"
+ local rw="atomic_write"
case "${type#c}" in
i) return;;
@@ -20,14 +20,17 @@ gen_param_check()
if [ ${type#c} != ${type} ]; then
# We don't write to constant parameters.
- rw="read"
+ rw="atomic_read"
+ elif [ "${type}" = "p" ] ; then
+ # The "old" argument in try_cmpxchg() gets accessed non-atomically
+ rw="read_write"
elif [ "${meta}" != "s" ]; then
# An atomic RMW: if this parameter is not a constant, and this atomic is
# not just a 's'tore, this parameter is both read from and written to.
- rw="read_write"
+ rw="atomic_read_write"
fi
- printf "\tinstrument_atomic_${rw}(${name}, sizeof(*${name}));\n"
+ printf "\tinstrument_${rw}(${name}, sizeof(*${name}));\n"
}
#gen_params_checks(meta, arg...)
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index c73cb802a0a3..8d01b741de62 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -277,12 +277,6 @@ handle_line() {
fi
done
- if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then
- words[$last-1]="${words[$last-1]} ${words[$last]}"
- unset words[$last] spaces[$last]
- last=$(( $last - 1 ))
- fi
-
# Extract info after the symbol if present. E.g.:
# func_name+0x54/0x80 (P)
# ^^^
@@ -295,6 +289,14 @@ handle_line() {
last=$(( $last - 1 ))
fi
+ # Join module name with its build id if present, as these were
+ # split during tokenization (e.g. "[module" and "modbuildid]").
+ if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then
+ words[$last-1]="${words[$last-1]} ${words[$last]}"
+ unset words[$last] spaces[$last]
+ last=$(( $last - 1 ))
+ fi
+
if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then
module=${words[$last]}
# some traces format is "(%pS)", which like "(foo+0x0/0x1 [bar])"
diff --git a/scripts/gendwarfksyms/gendwarfksyms.c b/scripts/gendwarfksyms/gendwarfksyms.c
index 08ae61eb327e..f5203d1640ee 100644
--- a/scripts/gendwarfksyms/gendwarfksyms.c
+++ b/scripts/gendwarfksyms/gendwarfksyms.c
@@ -138,7 +138,8 @@ int main(int argc, char **argv)
error("no input files?");
}
- symbol_read_exports(stdin);
+ if (!symbol_read_exports(stdin))
+ return 0;
if (symtypes_file) {
symfile = fopen(symtypes_file, "w");
diff --git a/scripts/gendwarfksyms/gendwarfksyms.h b/scripts/gendwarfksyms/gendwarfksyms.h
index d9c06d2cb1df..32cec8f7695a 100644
--- a/scripts/gendwarfksyms/gendwarfksyms.h
+++ b/scripts/gendwarfksyms/gendwarfksyms.h
@@ -123,7 +123,7 @@ struct symbol {
typedef void (*symbol_callback_t)(struct symbol *, void *arg);
bool is_symbol_ptr(const char *name);
-void symbol_read_exports(FILE *file);
+int symbol_read_exports(FILE *file);
void symbol_read_symtab(int fd);
struct symbol *symbol_get(const char *name);
void symbol_set_ptr(struct symbol *sym, Dwarf_Die *ptr);
diff --git a/scripts/gendwarfksyms/symbols.c b/scripts/gendwarfksyms/symbols.c
index 35ed594f0749..ecddcb5ffcdf 100644
--- a/scripts/gendwarfksyms/symbols.c
+++ b/scripts/gendwarfksyms/symbols.c
@@ -128,7 +128,7 @@ static bool is_exported(const char *name)
return for_each(name, NULL, NULL) > 0;
}
-void symbol_read_exports(FILE *file)
+int symbol_read_exports(FILE *file)
{
struct symbol *sym;
char *line = NULL;
@@ -159,6 +159,8 @@ void symbol_read_exports(FILE *file)
free(line);
debug("%d exported symbols", nsym);
+
+ return nsym;
}
static void get_symbol(struct symbol *sym, void *arg)
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index d1ae5e92c615..e74868be513c 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -410,3 +410,4 @@
467 common open_tree_attr sys_open_tree_attr
468 common file_getattr sys_file_getattr
469 common file_setattr sys_file_setattr
+470 common listns sys_listns
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 391a586d0557..9d08d103f142 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -355,17 +355,17 @@ static void aafs_remove(struct dentry *dentry)
if (!dentry || IS_ERR(dentry))
return;
+ /* ->d_parent is stable as rename is not supported */
dir = d_inode(dentry->d_parent);
- inode_lock(dir);
- if (simple_positive(dentry)) {
+ dentry = start_removing_dentry(dentry->d_parent, dentry);
+ if (!IS_ERR(dentry) && simple_positive(dentry)) {
if (d_is_dir(dentry))
simple_rmdir(dir, dentry);
else
simple_unlink(dir, dentry);
d_delete(dentry);
- dput(dentry);
}
- inode_unlock(dir);
+ end_removing(dentry);
simple_release_fs(&aafs_mnt, &aafs_count);
}
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index b5d5333ab330..a63c46bb2d14 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -51,7 +51,7 @@ static struct key *get_user_register(struct user_namespace *user_ns)
if (!reg_keyring) {
reg_keyring = keyring_alloc(".user_reg",
user_ns->owner, INVALID_GID,
- &init_cred,
+ kernel_cred(),
KEY_POS_WRITE | KEY_POS_SEARCH |
KEY_USR_VIEW | KEY_USR_READ,
0,
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 0bade2c5aa1d..cee2b6f22c83 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1296,7 +1296,7 @@ static void hook_sb_delete(struct super_block *const sb)
* second call to iput() for the same Landlock object. Also
* checks I_NEW because such inode cannot be tied to an object.
*/
- if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
+ if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE | I_NEW)) {
spin_unlock(&inode->i_lock);
continue;
}
@@ -1335,11 +1335,10 @@ static void hook_sb_delete(struct super_block *const sb)
* At this point, we own the ihold() reference that was
* originally set up by get_inode_object() and the
* __iget() reference that we just set in this loop
- * walk. Therefore the following call to iput() will
- * not sleep nor drop the inode because there is now at
- * least two references to it.
+ * walk. Therefore there are at least two references
+ * on the inode.
*/
- iput(inode);
+ iput_not_last(inode);
} else {
spin_unlock(&object->lock);
rcu_read_unlock();
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dfc22da42f30..e713291db873 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -210,12 +210,12 @@ static int selinux_lsm_notifier_avc_callback(u32 event)
*/
static void cred_init_security(void)
{
- struct task_security_struct *tsec;
+ struct cred_security_struct *crsec;
/* NOTE: the lsm framework zeros out the buffer on allocation */
- tsec = selinux_cred(unrcu_pointer(current->real_cred));
- tsec->osid = tsec->sid = tsec->avdcache.sid = SECINITSID_KERNEL;
+ crsec = selinux_cred(unrcu_pointer(current->real_cred));
+ crsec->osid = crsec->sid = SECINITSID_KERNEL;
}
/*
@@ -223,10 +223,10 @@ static void cred_init_security(void)
*/
static inline u32 cred_sid(const struct cred *cred)
{
- const struct task_security_struct *tsec;
+ const struct cred_security_struct *crsec;
- tsec = selinux_cred(cred);
- return tsec->sid;
+ crsec = selinux_cred(cred);
+ return crsec->sid;
}
static void __ad_net_init(struct common_audit_data *ad,
@@ -437,15 +437,15 @@ static int may_context_mount_sb_relabel(u32 sid,
struct superblock_security_struct *sbsec,
const struct cred *cred)
{
- const struct task_security_struct *tsec = selinux_cred(cred);
+ const struct cred_security_struct *crsec = selinux_cred(cred);
int rc;
- rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
+ rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
FILESYSTEM__RELABELFROM, NULL);
if (rc)
return rc;
- rc = avc_has_perm(tsec->sid, sid, SECCLASS_FILESYSTEM,
+ rc = avc_has_perm(crsec->sid, sid, SECCLASS_FILESYSTEM,
FILESYSTEM__RELABELTO, NULL);
return rc;
}
@@ -454,9 +454,9 @@ static int may_context_mount_inode_relabel(u32 sid,
struct superblock_security_struct *sbsec,
const struct cred *cred)
{
- const struct task_security_struct *tsec = selinux_cred(cred);
+ const struct cred_security_struct *crsec = selinux_cred(cred);
int rc;
- rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
+ rc = avc_has_perm(crsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
FILESYSTEM__RELABELFROM, NULL);
if (rc)
return rc;
@@ -1788,7 +1788,7 @@ out:
* Determine the label for an inode that might be unioned.
*/
static int
-selinux_determine_inode_label(const struct task_security_struct *tsec,
+selinux_determine_inode_label(const struct cred_security_struct *crsec,
struct inode *dir,
const struct qstr *name, u16 tclass,
u32 *_new_isid)
@@ -1800,11 +1800,11 @@ selinux_determine_inode_label(const struct task_security_struct *tsec,
(sbsec->behavior == SECURITY_FS_USE_MNTPOINT)) {
*_new_isid = sbsec->mntpoint_sid;
} else if ((sbsec->flags & SBLABEL_MNT) &&
- tsec->create_sid) {
- *_new_isid = tsec->create_sid;
+ crsec->create_sid) {
+ *_new_isid = crsec->create_sid;
} else {
const struct inode_security_struct *dsec = inode_security(dir);
- return security_transition_sid(tsec->sid,
+ return security_transition_sid(crsec->sid,
dsec->sid, tclass,
name, _new_isid);
}
@@ -1817,7 +1817,7 @@ static int may_create(struct inode *dir,
struct dentry *dentry,
u16 tclass)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
struct inode_security_struct *dsec;
struct superblock_security_struct *sbsec;
u32 sid, newsid;
@@ -1827,7 +1827,7 @@ static int may_create(struct inode *dir,
dsec = inode_security(dir);
sbsec = selinux_superblock(dir->i_sb);
- sid = tsec->sid;
+ sid = crsec->sid;
ad.type = LSM_AUDIT_DATA_DENTRY;
ad.u.dentry = dentry;
@@ -1838,7 +1838,7 @@ static int may_create(struct inode *dir,
if (rc)
return rc;
- rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name, tclass,
+ rc = selinux_determine_inode_label(crsec, dir, &dentry->d_name, tclass,
&newsid);
if (rc)
return rc;
@@ -2251,8 +2251,8 @@ static u32 ptrace_parent_sid(void)
}
static int check_nnp_nosuid(const struct linux_binprm *bprm,
- const struct task_security_struct *old_tsec,
- const struct task_security_struct *new_tsec)
+ const struct cred_security_struct *old_crsec,
+ const struct cred_security_struct *new_crsec)
{
int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
@@ -2262,7 +2262,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
if (!nnp && !nosuid)
return 0; /* neither NNP nor nosuid */
- if (new_tsec->sid == old_tsec->sid)
+ if (new_crsec->sid == old_crsec->sid)
return 0; /* No change in credentials */
/*
@@ -2277,7 +2277,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
av |= PROCESS2__NNP_TRANSITION;
if (nosuid)
av |= PROCESS2__NOSUID_TRANSITION;
- rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+ rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
SECCLASS_PROCESS2, av, NULL);
if (!rc)
return 0;
@@ -2288,8 +2288,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
* i.e. SIDs that are guaranteed to only be allowed a subset
* of the permissions of the current SID.
*/
- rc = security_bounded_transition(old_tsec->sid,
- new_tsec->sid);
+ rc = security_bounded_transition(old_crsec->sid,
+ new_crsec->sid);
if (!rc)
return 0;
@@ -2305,8 +2305,8 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
{
- const struct task_security_struct *old_tsec;
- struct task_security_struct *new_tsec;
+ const struct cred_security_struct *old_crsec;
+ struct cred_security_struct *new_crsec;
struct inode_security_struct *isec;
struct common_audit_data ad;
struct inode *inode = file_inode(bprm->file);
@@ -2315,18 +2315,18 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
/* SELinux context only depends on initial program or script and not
* the script interpreter */
- old_tsec = selinux_cred(current_cred());
- new_tsec = selinux_cred(bprm->cred);
+ old_crsec = selinux_cred(current_cred());
+ new_crsec = selinux_cred(bprm->cred);
isec = inode_security(inode);
/* Default to the current task SID. */
- new_tsec->sid = old_tsec->sid;
- new_tsec->osid = old_tsec->sid;
+ new_crsec->sid = old_crsec->sid;
+ new_crsec->osid = old_crsec->sid;
/* Reset fs, key, and sock SIDs on execve. */
- new_tsec->create_sid = 0;
- new_tsec->keycreate_sid = 0;
- new_tsec->sockcreate_sid = 0;
+ new_crsec->create_sid = 0;
+ new_crsec->keycreate_sid = 0;
+ new_crsec->sockcreate_sid = 0;
/*
* Before policy is loaded, label any task outside kernel space
@@ -2335,26 +2335,26 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
* (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
*/
if (!selinux_initialized()) {
- new_tsec->sid = SECINITSID_INIT;
+ new_crsec->sid = SECINITSID_INIT;
/* also clear the exec_sid just in case */
- new_tsec->exec_sid = 0;
+ new_crsec->exec_sid = 0;
return 0;
}
- if (old_tsec->exec_sid) {
- new_tsec->sid = old_tsec->exec_sid;
+ if (old_crsec->exec_sid) {
+ new_crsec->sid = old_crsec->exec_sid;
/* Reset exec SID on execve. */
- new_tsec->exec_sid = 0;
+ new_crsec->exec_sid = 0;
/* Fail on NNP or nosuid if not an allowed transition. */
- rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
+ rc = check_nnp_nosuid(bprm, old_crsec, new_crsec);
if (rc)
return rc;
} else {
/* Check for a default transition on this program. */
- rc = security_transition_sid(old_tsec->sid,
+ rc = security_transition_sid(old_crsec->sid,
isec->sid, SECCLASS_PROCESS, NULL,
- &new_tsec->sid);
+ &new_crsec->sid);
if (rc)
return rc;
@@ -2362,34 +2362,34 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
* Fallback to old SID on NNP or nosuid if not an allowed
* transition.
*/
- rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
+ rc = check_nnp_nosuid(bprm, old_crsec, new_crsec);
if (rc)
- new_tsec->sid = old_tsec->sid;
+ new_crsec->sid = old_crsec->sid;
}
ad.type = LSM_AUDIT_DATA_FILE;
ad.u.file = bprm->file;
- if (new_tsec->sid == old_tsec->sid) {
- rc = avc_has_perm(old_tsec->sid, isec->sid,
+ if (new_crsec->sid == old_crsec->sid) {
+ rc = avc_has_perm(old_crsec->sid, isec->sid,
SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
if (rc)
return rc;
} else {
/* Check permissions for the transition. */
- rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+ rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
if (rc)
return rc;
- rc = avc_has_perm(new_tsec->sid, isec->sid,
+ rc = avc_has_perm(new_crsec->sid, isec->sid,
SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
if (rc)
return rc;
/* Check for shared state */
if (bprm->unsafe & LSM_UNSAFE_SHARE) {
- rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+ rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
SECCLASS_PROCESS, PROCESS__SHARE,
NULL);
if (rc)
@@ -2401,7 +2401,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
u32 ptsid = ptrace_parent_sid();
if (ptsid != 0) {
- rc = avc_has_perm(ptsid, new_tsec->sid,
+ rc = avc_has_perm(ptsid, new_crsec->sid,
SECCLASS_PROCESS,
PROCESS__PTRACE, NULL);
if (rc)
@@ -2415,7 +2415,7 @@ static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
/* Enable secure mode for SIDs transitions unless
the noatsecure permission is granted between
the two SIDs, i.e. ahp returns 0. */
- rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+ rc = avc_has_perm(old_crsec->sid, new_crsec->sid,
SECCLASS_PROCESS, PROCESS__NOATSECURE,
NULL);
bprm->secureexec |= !!rc;
@@ -2483,12 +2483,12 @@ static inline void flush_unauthorized_files(const struct cred *cred,
*/
static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
{
- struct task_security_struct *new_tsec;
+ struct cred_security_struct *new_crsec;
struct rlimit *rlim, *initrlim;
int rc, i;
- new_tsec = selinux_cred(bprm->cred);
- if (new_tsec->sid == new_tsec->osid)
+ new_crsec = selinux_cred(bprm->cred);
+ if (new_crsec->sid == new_crsec->osid)
return;
/* Close files for which the new task SID is not authorized. */
@@ -2507,7 +2507,7 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
* higher than the default soft limit for cases where the default is
* lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK.
*/
- rc = avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS,
+ rc = avc_has_perm(new_crsec->osid, new_crsec->sid, SECCLASS_PROCESS,
PROCESS__RLIMITINH, NULL);
if (rc) {
/* protect against do_prlimit() */
@@ -2529,12 +2529,12 @@ static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
*/
static void selinux_bprm_committed_creds(const struct linux_binprm *bprm)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
u32 osid, sid;
int rc;
- osid = tsec->osid;
- sid = tsec->sid;
+ osid = crsec->osid;
+ sid = crsec->sid;
if (sid == osid)
return;
@@ -2911,7 +2911,7 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
{
u32 newsid;
int rc;
- struct task_security_struct *tsec;
+ struct cred_security_struct *crsec;
rc = selinux_determine_inode_label(selinux_cred(old),
d_inode(dentry->d_parent), name,
@@ -2920,8 +2920,8 @@ static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
if (rc)
return rc;
- tsec = selinux_cred(new);
- tsec->create_sid = newsid;
+ crsec = selinux_cred(new);
+ crsec->create_sid = newsid;
return 0;
}
@@ -2929,7 +2929,7 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr,
struct xattr *xattrs, int *xattr_count)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
struct superblock_security_struct *sbsec;
struct xattr *xattr = lsm_get_xattr_slot(xattrs, xattr_count);
u32 newsid, clen;
@@ -2939,9 +2939,9 @@ static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
sbsec = selinux_superblock(dir->i_sb);
- newsid = tsec->create_sid;
+ newsid = crsec->create_sid;
newsclass = inode_mode_to_security_class(inode->i_mode);
- rc = selinux_determine_inode_label(tsec, dir, qstr, newsclass, &newsid);
+ rc = selinux_determine_inode_label(crsec, dir, qstr, newsclass, &newsid);
if (rc)
return rc;
@@ -3113,7 +3113,7 @@ static noinline int audit_inode_permission(struct inode *inode,
static inline void task_avdcache_reset(struct task_security_struct *tsec)
{
memset(&tsec->avdcache.dir, 0, sizeof(tsec->avdcache.dir));
- tsec->avdcache.sid = tsec->sid;
+ tsec->avdcache.sid = current_sid();
tsec->avdcache.seqno = avc_policy_seqno();
tsec->avdcache.dir_spot = TSEC_AVDC_DIR_SIZE - 1;
}
@@ -3137,7 +3137,7 @@ static inline int task_avdcache_search(struct task_security_struct *tsec,
if (isec->sclass != SECCLASS_DIR)
return -ENOENT;
- if (unlikely(tsec->sid != tsec->avdcache.sid ||
+ if (unlikely(current_sid() != tsec->avdcache.sid ||
tsec->avdcache.seqno != avc_policy_seqno())) {
task_avdcache_reset(tsec);
return -ENOENT;
@@ -3201,6 +3201,7 @@ static int selinux_inode_permission(struct inode *inode, int requested)
{
int mask;
u32 perms;
+ u32 sid = current_sid();
struct task_security_struct *tsec;
struct inode_security_struct *isec;
struct avdc_entry *avdc;
@@ -3213,8 +3214,8 @@ static int selinux_inode_permission(struct inode *inode, int requested)
if (!mask)
return 0;
- tsec = selinux_cred(current_cred());
- if (task_avdcache_permnoaudit(tsec))
+ tsec = selinux_task(current);
+ if (task_avdcache_permnoaudit(tsec, sid))
return 0;
isec = inode_security_rcu(inode, requested & MAY_NOT_BLOCK);
@@ -3234,7 +3235,7 @@ static int selinux_inode_permission(struct inode *inode, int requested)
struct av_decision avd;
/* Cache miss. */
- rc = avc_has_perm_noaudit(tsec->sid, isec->sid, isec->sclass,
+ rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass,
perms, 0, &avd);
audited = avc_audit_required(perms, &avd, rc,
(requested & MAY_ACCESS) ? FILE__AUDIT_ACCESS : 0,
@@ -3285,9 +3286,9 @@ static int selinux_inode_getattr(const struct path *path)
{
struct task_security_struct *tsec;
- tsec = selinux_cred(current_cred());
+ tsec = selinux_task(current);
- if (task_avdcache_permnoaudit(tsec))
+ if (task_avdcache_permnoaudit(tsec, current_sid()))
return 0;
return path_has_perm(current_cred(), path, FILE__GETATTR);
@@ -3659,7 +3660,7 @@ static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
struct lsm_prop prop;
- struct task_security_struct *tsec;
+ struct cred_security_struct *crsec;
struct cred *new_creds = *new;
if (new_creds == NULL) {
@@ -3668,10 +3669,10 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
return -ENOMEM;
}
- tsec = selinux_cred(new_creds);
+ crsec = selinux_cred(new_creds);
/* Get label from overlay inode and set it in create_sid */
selinux_inode_getlsmprop(d_inode(src), &prop);
- tsec->create_sid = prop.selinux.secid;
+ crsec->create_sid = prop.selinux.secid;
*new = new_creds;
return 0;
}
@@ -3697,7 +3698,7 @@ static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name)
static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
struct kernfs_node *kn)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
u32 parent_sid, newsid, clen;
int rc;
char *context;
@@ -3725,8 +3726,8 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
if (rc)
return rc;
- if (tsec->create_sid) {
- newsid = tsec->create_sid;
+ if (crsec->create_sid) {
+ newsid = crsec->create_sid;
} else {
u16 secclass = inode_mode_to_security_class(kn->mode);
const char *kn_name;
@@ -3737,7 +3738,7 @@ static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
q.name = kn_name;
q.hash_len = hashlen_string(kn_dir, kn_name);
- rc = security_transition_sid(tsec->sid,
+ rc = security_transition_sid(crsec->sid,
parent_sid, secclass, &q,
&newsid);
if (rc)
@@ -4151,7 +4152,10 @@ static int selinux_task_alloc(struct task_struct *task,
u64 clone_flags)
{
u32 sid = current_sid();
+ struct task_security_struct *old_tsec = selinux_task(current);
+ struct task_security_struct *new_tsec = selinux_task(task);
+ *new_tsec = *old_tsec;
return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL);
}
@@ -4161,10 +4165,10 @@ static int selinux_task_alloc(struct task_struct *task,
static int selinux_cred_prepare(struct cred *new, const struct cred *old,
gfp_t gfp)
{
- const struct task_security_struct *old_tsec = selinux_cred(old);
- struct task_security_struct *tsec = selinux_cred(new);
+ const struct cred_security_struct *old_crsec = selinux_cred(old);
+ struct cred_security_struct *crsec = selinux_cred(new);
- *tsec = *old_tsec;
+ *crsec = *old_crsec;
return 0;
}
@@ -4173,10 +4177,10 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old,
*/
static void selinux_cred_transfer(struct cred *new, const struct cred *old)
{
- const struct task_security_struct *old_tsec = selinux_cred(old);
- struct task_security_struct *tsec = selinux_cred(new);
+ const struct cred_security_struct *old_crsec = selinux_cred(old);
+ struct cred_security_struct *crsec = selinux_cred(new);
- *tsec = *old_tsec;
+ *crsec = *old_crsec;
}
static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
@@ -4195,7 +4199,7 @@ static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
*/
static int selinux_kernel_act_as(struct cred *new, u32 secid)
{
- struct task_security_struct *tsec = selinux_cred(new);
+ struct cred_security_struct *crsec = selinux_cred(new);
u32 sid = current_sid();
int ret;
@@ -4204,10 +4208,10 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid)
KERNEL_SERVICE__USE_AS_OVERRIDE,
NULL);
if (ret == 0) {
- tsec->sid = secid;
- tsec->create_sid = 0;
- tsec->keycreate_sid = 0;
- tsec->sockcreate_sid = 0;
+ crsec->sid = secid;
+ crsec->create_sid = 0;
+ crsec->keycreate_sid = 0;
+ crsec->sockcreate_sid = 0;
}
return ret;
}
@@ -4219,7 +4223,7 @@ static int selinux_kernel_act_as(struct cred *new, u32 secid)
static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
{
struct inode_security_struct *isec = inode_security(inode);
- struct task_security_struct *tsec = selinux_cred(new);
+ struct cred_security_struct *crsec = selinux_cred(new);
u32 sid = current_sid();
int ret;
@@ -4229,7 +4233,7 @@ static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
NULL);
if (ret == 0)
- tsec->create_sid = isec->sid;
+ crsec->create_sid = isec->sid;
return ret;
}
@@ -4744,15 +4748,15 @@ static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid)
/* socket security operations */
-static int socket_sockcreate_sid(const struct task_security_struct *tsec,
+static int socket_sockcreate_sid(const struct cred_security_struct *crsec,
u16 secclass, u32 *socksid)
{
- if (tsec->sockcreate_sid > SECSID_NULL) {
- *socksid = tsec->sockcreate_sid;
+ if (crsec->sockcreate_sid > SECSID_NULL) {
+ *socksid = crsec->sockcreate_sid;
return 0;
}
- return security_transition_sid(tsec->sid, tsec->sid,
+ return security_transition_sid(crsec->sid, crsec->sid,
secclass, NULL, socksid);
}
@@ -4797,7 +4801,7 @@ static int sock_has_perm(struct sock *sk, u32 perms)
static int selinux_socket_create(int family, int type,
int protocol, int kern)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
u32 newsid;
u16 secclass;
int rc;
@@ -4806,17 +4810,17 @@ static int selinux_socket_create(int family, int type,
return 0;
secclass = socket_type_to_security_class(family, type, protocol);
- rc = socket_sockcreate_sid(tsec, secclass, &newsid);
+ rc = socket_sockcreate_sid(crsec, secclass, &newsid);
if (rc)
return rc;
- return avc_has_perm(tsec->sid, newsid, secclass, SOCKET__CREATE, NULL);
+ return avc_has_perm(crsec->sid, newsid, secclass, SOCKET__CREATE, NULL);
}
static int selinux_socket_post_create(struct socket *sock, int family,
int type, int protocol, int kern)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
struct sk_security_struct *sksec;
u16 sclass = socket_type_to_security_class(family, type, protocol);
@@ -4824,7 +4828,7 @@ static int selinux_socket_post_create(struct socket *sock, int family,
int err = 0;
if (!kern) {
- err = socket_sockcreate_sid(tsec, sclass, &sid);
+ err = socket_sockcreate_sid(crsec, sclass, &sid);
if (err)
return err;
}
@@ -6526,37 +6530,37 @@ static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
char **value)
{
- const struct task_security_struct *tsec;
+ const struct cred_security_struct *crsec;
int error;
u32 sid;
u32 len;
rcu_read_lock();
- tsec = selinux_cred(__task_cred(p));
+ crsec = selinux_cred(__task_cred(p));
if (p != current) {
- error = avc_has_perm(current_sid(), tsec->sid,
+ error = avc_has_perm(current_sid(), crsec->sid,
SECCLASS_PROCESS, PROCESS__GETATTR, NULL);
if (error)
goto err_unlock;
}
switch (attr) {
case LSM_ATTR_CURRENT:
- sid = tsec->sid;
+ sid = crsec->sid;
break;
case LSM_ATTR_PREV:
- sid = tsec->osid;
+ sid = crsec->osid;
break;
case LSM_ATTR_EXEC:
- sid = tsec->exec_sid;
+ sid = crsec->exec_sid;
break;
case LSM_ATTR_FSCREATE:
- sid = tsec->create_sid;
+ sid = crsec->create_sid;
break;
case LSM_ATTR_KEYCREATE:
- sid = tsec->keycreate_sid;
+ sid = crsec->keycreate_sid;
break;
case LSM_ATTR_SOCKCREATE:
- sid = tsec->sockcreate_sid;
+ sid = crsec->sockcreate_sid;
break;
default:
error = -EOPNOTSUPP;
@@ -6581,7 +6585,7 @@ err_unlock:
static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
{
- struct task_security_struct *tsec;
+ struct cred_security_struct *crsec;
struct cred *new;
u32 mysid = current_sid(), sid = 0, ptsid;
int error;
@@ -6667,11 +6671,11 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
operation. See selinux_bprm_creds_for_exec for the execve
checks and may_create for the file creation checks. The
operation will then fail if the context is not permitted. */
- tsec = selinux_cred(new);
+ crsec = selinux_cred(new);
if (attr == LSM_ATTR_EXEC) {
- tsec->exec_sid = sid;
+ crsec->exec_sid = sid;
} else if (attr == LSM_ATTR_FSCREATE) {
- tsec->create_sid = sid;
+ crsec->create_sid = sid;
} else if (attr == LSM_ATTR_KEYCREATE) {
if (sid) {
error = avc_has_perm(mysid, sid,
@@ -6679,22 +6683,22 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
if (error)
goto abort_change;
}
- tsec->keycreate_sid = sid;
+ crsec->keycreate_sid = sid;
} else if (attr == LSM_ATTR_SOCKCREATE) {
- tsec->sockcreate_sid = sid;
+ crsec->sockcreate_sid = sid;
} else if (attr == LSM_ATTR_CURRENT) {
error = -EINVAL;
if (sid == 0)
goto abort_change;
if (!current_is_single_threaded()) {
- error = security_bounded_transition(tsec->sid, sid);
+ error = security_bounded_transition(crsec->sid, sid);
if (error)
goto abort_change;
}
/* Check permissions for the transition. */
- error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
+ error = avc_has_perm(crsec->sid, sid, SECCLASS_PROCESS,
PROCESS__DYNTRANSITION, NULL);
if (error)
goto abort_change;
@@ -6709,7 +6713,7 @@ static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
goto abort_change;
}
- tsec->sid = sid;
+ crsec->sid = sid;
} else {
error = -EINVAL;
goto abort_change;
@@ -6876,14 +6880,14 @@ static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
static int selinux_key_alloc(struct key *k, const struct cred *cred,
unsigned long flags)
{
- const struct task_security_struct *tsec;
+ const struct cred_security_struct *crsec;
struct key_security_struct *ksec = selinux_key(k);
- tsec = selinux_cred(cred);
- if (tsec->keycreate_sid)
- ksec->sid = tsec->keycreate_sid;
+ crsec = selinux_cred(cred);
+ if (crsec->keycreate_sid)
+ ksec->sid = crsec->keycreate_sid;
else
- ksec->sid = tsec->sid;
+ ksec->sid = crsec->sid;
return 0;
}
@@ -7137,7 +7141,8 @@ static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *att
#endif
struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
- .lbs_cred = sizeof(struct task_security_struct),
+ .lbs_cred = sizeof(struct cred_security_struct),
+ .lbs_task = sizeof(struct task_security_struct),
.lbs_file = sizeof(struct file_security_struct),
.lbs_inode = sizeof(struct inode_security_struct),
.lbs_ipc = sizeof(struct ipc_security_struct),
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 2d5139c6d45b..8fc3de5234ac 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -37,13 +37,16 @@ struct avdc_entry {
bool permissive; /* AVC permissive flag */
};
-struct task_security_struct {
+struct cred_security_struct {
u32 osid; /* SID prior to last execve */
u32 sid; /* current SID */
u32 exec_sid; /* exec SID */
u32 create_sid; /* fscreate SID */
u32 keycreate_sid; /* keycreate SID */
u32 sockcreate_sid; /* fscreate SID */
+} __randomize_layout;
+
+struct task_security_struct {
#define TSEC_AVDC_DIR_SIZE (1 << 2)
struct {
u32 sid; /* current SID for cached entries */
@@ -54,10 +57,11 @@ struct task_security_struct {
} avdcache;
} __randomize_layout;
-static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec)
+static inline bool task_avdcache_permnoaudit(struct task_security_struct *tsec,
+ u32 sid)
{
return (tsec->avdcache.permissive_neveraudit &&
- tsec->sid == tsec->avdcache.sid &&
+ sid == tsec->avdcache.sid &&
tsec->avdcache.seqno == avc_policy_seqno());
}
@@ -172,11 +176,17 @@ struct perf_event_security_struct {
};
extern struct lsm_blob_sizes selinux_blob_sizes;
-static inline struct task_security_struct *selinux_cred(const struct cred *cred)
+static inline struct cred_security_struct *selinux_cred(const struct cred *cred)
{
return cred->security + selinux_blob_sizes.lbs_cred;
}
+static inline struct task_security_struct *
+selinux_task(const struct task_struct *task)
+{
+ return task->security + selinux_blob_sizes.lbs_task;
+}
+
static inline struct file_security_struct *selinux_file(const struct file *file)
{
return file->f_security + selinux_blob_sizes.lbs_file;
@@ -207,9 +217,9 @@ selinux_ipc(const struct kern_ipc_perm *ipc)
*/
static inline u32 current_sid(void)
{
- const struct task_security_struct *tsec = selinux_cred(current_cred());
+ const struct cred_security_struct *crsec = selinux_cred(current_cred());
- return tsec->sid;
+ return crsec->sid;
}
static inline struct superblock_security_struct *
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 232e087bce3e..404e08bf60ba 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -506,6 +506,7 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
{
int ret = 0;
struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir;
+ struct renamedata rd = {};
unsigned int bool_num = 0;
char **bool_names = NULL;
int *bool_values = NULL;
@@ -539,9 +540,14 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
if (ret)
goto out;
- lock_rename(tmp_parent, fsi->sb->s_root);
+ rd.old_parent = tmp_parent;
+ rd.new_parent = fsi->sb->s_root;
/* booleans */
+ ret = start_renaming_two_dentries(&rd, tmp_bool_dir, fsi->bool_dir);
+ if (ret)
+ goto out;
+
d_exchange(tmp_bool_dir, fsi->bool_dir);
swap(fsi->bool_num, bool_num);
@@ -549,12 +555,17 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
swap(fsi->bool_pending_values, bool_values);
fsi->bool_dir = tmp_bool_dir;
+ end_renaming(&rd);
/* classes */
+ ret = start_renaming_two_dentries(&rd, tmp_class_dir, fsi->class_dir);
+ if (ret)
+ goto out;
+
d_exchange(tmp_class_dir, fsi->class_dir);
fsi->class_dir = tmp_class_dir;
- unlock_rename(tmp_parent, fsi->sb->s_root);
+ end_renaming(&rd);
out:
sel_remove_old_bool_data(bool_num, bool_names, bool_values);
diff --git a/sound/hda/codecs/cirrus/cs420x.c b/sound/hda/codecs/cirrus/cs420x.c
index 823220d5cada..13f5f1711fa4 100644
--- a/sound/hda/codecs/cirrus/cs420x.c
+++ b/sound/hda/codecs/cirrus/cs420x.c
@@ -585,6 +585,7 @@ static const struct hda_quirk cs4208_mac_fixup_tbl[] = {
SND_PCI_QUIRK(0x106b, 0x6c00, "MacMini 7,1", CS4208_MACMINI),
SND_PCI_QUIRK(0x106b, 0x7100, "MacBookAir 6,1", CS4208_MBA6),
SND_PCI_QUIRK(0x106b, 0x7200, "MacBookAir 6,2", CS4208_MBA6),
+ SND_PCI_QUIRK(0x106b, 0x7800, "MacPro 6,1", CS4208_MACMINI),
SND_PCI_QUIRK(0x106b, 0x7b00, "MacBookPro 12,1", CS4208_MBP11),
{} /* terminator */
};
diff --git a/sound/hda/codecs/hdmi/nvhdmi-mcp.c b/sound/hda/codecs/hdmi/nvhdmi-mcp.c
index 8fd8d76fa72f..1c5fdfe872f2 100644
--- a/sound/hda/codecs/hdmi/nvhdmi-mcp.c
+++ b/sound/hda/codecs/hdmi/nvhdmi-mcp.c
@@ -350,8 +350,8 @@ static int nvhdmi_mcp_probe(struct hda_codec *codec,
static const struct hda_codec_ops nvhdmi_mcp_codec_ops = {
.probe = nvhdmi_mcp_probe,
.remove = snd_hda_hdmi_simple_remove,
- .build_controls = nvhdmi_mcp_build_pcms,
- .build_pcms = nvhdmi_mcp_build_controls,
+ .build_pcms = nvhdmi_mcp_build_pcms,
+ .build_controls = nvhdmi_mcp_build_controls,
.init = nvhdmi_mcp_init,
.unsol_event = snd_hda_hdmi_simple_unsol_event,
};
diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c
index 4aec5067c59d..b45fcc9a3785 100644
--- a/sound/hda/codecs/realtek/alc269.c
+++ b/sound/hda/codecs/realtek/alc269.c
@@ -6525,6 +6525,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8a4f, "HP Victus 15-fa0xxx (MB 8A4F)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT),
SND_PCI_QUIRK(0x103c, 0x8a6e, "HP EDNA 360", ALC287_FIXUP_CS35L41_I2C_4),
SND_PCI_QUIRK(0x103c, 0x8a74, "HP ProBook 440 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED),
+ SND_PCI_QUIRK(0x103c, 0x8a75, "HP ProBook 450 G8 Notebook PC", ALC236_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8a78, "HP Dev One", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST),
SND_PCI_QUIRK(0x103c, 0x8aa0, "HP ProBook 440 G9 (MB 8A9E)", ALC236_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED),
@@ -6572,6 +6573,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8bc8, "HP Victus 15-fa1xxx", ALC245_FIXUP_HP_MUTE_LED_COEFBIT),
SND_PCI_QUIRK(0x103c, 0x8bcd, "HP Omen 16-xd0xxx", ALC245_FIXUP_HP_MUTE_LED_V1_COEFBIT),
SND_PCI_QUIRK(0x103c, 0x8bd4, "HP Victus 16-s0xxx (MB 8BD4)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT),
+ SND_PCI_QUIRK(0x103c, 0x8bd6, "HP Pavilion Aero Laptop 13z-be200", ALC287_FIXUP_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8bdd, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8bde, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8bdf, "HP Envy 15", ALC287_FIXUP_CS35L41_I2C_2),
@@ -6694,6 +6696,15 @@ static const struct hda_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8e60, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2),
+ SND_PCI_QUIRK(0x103c, 0x8ed5, "HP Merino13X", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8ed6, "HP Merino13", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8ed7, "HP Merino14", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8ed8, "HP Merino16", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8ed9, "HP Merino14W", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8eda, "HP Merino16W", ALC245_FIXUP_TAS2781_SPI_2),
+ SND_PCI_QUIRK(0x103c, 0x8f40, "HP Lampas14", ALC287_FIXUP_TXNW2781_I2C),
+ SND_PCI_QUIRK(0x103c, 0x8f41, "HP Lampas16", ALC287_FIXUP_TXNW2781_I2C),
+ SND_PCI_QUIRK(0x103c, 0x8f42, "HP LampasW14", ALC287_FIXUP_TXNW2781_I2C),
SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1043, 0x1034, "ASUS GU605C", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1),
SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC),
diff --git a/sound/pci/au88x0/au88x0.c b/sound/pci/au88x0/au88x0.c
index de56e83d8e10..bb02945793f0 100644
--- a/sound/pci/au88x0/au88x0.c
+++ b/sound/pci/au88x0/au88x0.c
@@ -280,11 +280,11 @@ __snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
// (5)
err = pci_read_config_word(pci, PCI_DEVICE_ID, &chip->device);
- if (err < 0)
- return err;
+ if (err)
+ return pcibios_err_to_errno(err);
err = pci_read_config_word(pci, PCI_VENDOR_ID, &chip->vendor);
- if (err < 0)
- return err;
+ if (err)
+ return pcibios_err_to_errno(err);
chip->rev = pci->revision;
#ifdef CHIP_AU8830
if ((chip->rev) != 0xfe && (chip->rev) != 0xfa) {
diff --git a/sound/soc/codecs/cs4271.c b/sound/soc/codecs/cs4271.c
index 6a3cca3d26c7..ead447a5da7f 100644
--- a/sound/soc/codecs/cs4271.c
+++ b/sound/soc/codecs/cs4271.c
@@ -581,17 +581,17 @@ static int cs4271_component_probe(struct snd_soc_component *component)
ret = regcache_sync(cs4271->regmap);
if (ret < 0)
- return ret;
+ goto err_disable_regulator;
ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2,
CS4271_MODE2_PDN | CS4271_MODE2_CPEN,
CS4271_MODE2_PDN | CS4271_MODE2_CPEN);
if (ret < 0)
- return ret;
+ goto err_disable_regulator;
ret = regmap_update_bits(cs4271->regmap, CS4271_MODE2,
CS4271_MODE2_PDN, 0);
if (ret < 0)
- return ret;
+ goto err_disable_regulator;
/* Power-up sequence requires 85 uS */
udelay(85);
@@ -601,6 +601,10 @@ static int cs4271_component_probe(struct snd_soc_component *component)
CS4271_MODE2_MUTECAEQUB);
return 0;
+
+err_disable_regulator:
+ regulator_bulk_disable(ARRAY_SIZE(cs4271->supplies), cs4271->supplies);
+ return ret;
}
static void cs4271_component_remove(struct snd_soc_component *component)
diff --git a/sound/soc/codecs/da7213.c b/sound/soc/codecs/da7213.c
index ae89260ca215..3420011da444 100644
--- a/sound/soc/codecs/da7213.c
+++ b/sound/soc/codecs/da7213.c
@@ -2124,11 +2124,50 @@ static int da7213_probe(struct snd_soc_component *component)
return 0;
}
+static int da7213_runtime_suspend(struct device *dev)
+{
+ struct da7213_priv *da7213 = dev_get_drvdata(dev);
+
+ regcache_cache_only(da7213->regmap, true);
+ regcache_mark_dirty(da7213->regmap);
+ regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies);
+
+ return 0;
+}
+
+static int da7213_runtime_resume(struct device *dev)
+{
+ struct da7213_priv *da7213 = dev_get_drvdata(dev);
+ int ret;
+
+ ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies);
+ if (ret < 0)
+ return ret;
+ regcache_cache_only(da7213->regmap, false);
+ return regcache_sync(da7213->regmap);
+}
+
+static int da7213_suspend(struct snd_soc_component *component)
+{
+ struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component);
+
+ return da7213_runtime_suspend(da7213->dev);
+}
+
+static int da7213_resume(struct snd_soc_component *component)
+{
+ struct da7213_priv *da7213 = snd_soc_component_get_drvdata(component);
+
+ return da7213_runtime_resume(da7213->dev);
+}
+
static const struct snd_soc_component_driver soc_component_dev_da7213 = {
.probe = da7213_probe,
.set_bias_level = da7213_set_bias_level,
.controls = da7213_snd_controls,
.num_controls = ARRAY_SIZE(da7213_snd_controls),
+ .suspend = da7213_suspend,
+ .resume = da7213_resume,
.dapm_widgets = da7213_dapm_widgets,
.num_dapm_widgets = ARRAY_SIZE(da7213_dapm_widgets),
.dapm_routes = da7213_audio_map,
@@ -2175,6 +2214,8 @@ static int da7213_i2c_probe(struct i2c_client *i2c)
if (!da7213->fin_min_rate)
return -EINVAL;
+ da7213->dev = &i2c->dev;
+
i2c_set_clientdata(i2c, da7213);
/* Get required supplies */
@@ -2224,31 +2265,9 @@ static void da7213_i2c_remove(struct i2c_client *i2c)
pm_runtime_disable(&i2c->dev);
}
-static int da7213_runtime_suspend(struct device *dev)
-{
- struct da7213_priv *da7213 = dev_get_drvdata(dev);
-
- regcache_cache_only(da7213->regmap, true);
- regcache_mark_dirty(da7213->regmap);
- regulator_bulk_disable(DA7213_NUM_SUPPLIES, da7213->supplies);
-
- return 0;
-}
-
-static int da7213_runtime_resume(struct device *dev)
-{
- struct da7213_priv *da7213 = dev_get_drvdata(dev);
- int ret;
-
- ret = regulator_bulk_enable(DA7213_NUM_SUPPLIES, da7213->supplies);
- if (ret < 0)
- return ret;
- regcache_cache_only(da7213->regmap, false);
- return regcache_sync(da7213->regmap);
-}
-
-static DEFINE_RUNTIME_DEV_PM_OPS(da7213_pm, da7213_runtime_suspend,
- da7213_runtime_resume, NULL);
+static const struct dev_pm_ops da7213_pm = {
+ RUNTIME_PM_OPS(da7213_runtime_suspend, da7213_runtime_resume, NULL)
+};
static const struct i2c_device_id da7213_i2c_id[] = {
{ "da7213" },
diff --git a/sound/soc/codecs/da7213.h b/sound/soc/codecs/da7213.h
index b9ab791d6b88..29cbf0eb6124 100644
--- a/sound/soc/codecs/da7213.h
+++ b/sound/soc/codecs/da7213.h
@@ -595,6 +595,7 @@ enum da7213_supplies {
/* Codec private data */
struct da7213_priv {
struct regmap *regmap;
+ struct device *dev;
struct mutex ctrl_lock;
struct regulator_bulk_data supplies[DA7213_NUM_SUPPLIES];
struct clk *mclk;
diff --git a/sound/soc/codecs/lpass-va-macro.c b/sound/soc/codecs/lpass-va-macro.c
index 2e1b77973a3e..92c177b82a02 100644
--- a/sound/soc/codecs/lpass-va-macro.c
+++ b/sound/soc/codecs/lpass-va-macro.c
@@ -1638,7 +1638,7 @@ static int va_macro_probe(struct platform_device *pdev)
if (ret)
goto err_clkout;
- va->fsgen = clk_hw_get_clk(&va->hw, "fsgen");
+ va->fsgen = devm_clk_hw_get_clk(dev, &va->hw, "fsgen");
if (IS_ERR(va->fsgen)) {
ret = PTR_ERR(va->fsgen);
goto err_clkout;
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index ba880b5de7e8..8f37aa00e62e 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -1957,7 +1957,8 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv)
{
struct i2c_client *client = (struct i2c_client *)tas_priv->client;
unsigned int dev_addrs[TASDEVICE_MAX_CHANNELS];
- int i, ndev = 0;
+ int ndev = 0;
+ int i, rc;
if (tas_priv->isacpi) {
ndev = device_property_read_u32_array(&client->dev,
@@ -1968,8 +1969,12 @@ static void tasdevice_parse_dt(struct tasdevice_priv *tas_priv)
} else {
ndev = (ndev < ARRAY_SIZE(dev_addrs))
? ndev : ARRAY_SIZE(dev_addrs);
- ndev = device_property_read_u32_array(&client->dev,
+ rc = device_property_read_u32_array(&client->dev,
"ti,audio-slots", dev_addrs, ndev);
+ if (rc != 0) {
+ ndev = 1;
+ dev_addrs[0] = client->addr;
+ }
}
tas_priv->irq =
diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c
index 1fb4227b711e..e273b80d033e 100644
--- a/sound/soc/codecs/tas2783-sdw.c
+++ b/sound/soc/codecs/tas2783-sdw.c
@@ -762,10 +762,17 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context)
goto out;
}
- mutex_lock(&tas_dev->pde_lock);
img_sz = fmw->size;
buf = fmw->data;
offset += FW_DL_OFFSET;
+ if (offset >= (img_sz - FW_FL_HDR)) {
+ dev_err(tas_dev->dev,
+ "firmware is too small");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&tas_dev->pde_lock);
while (offset < (img_sz - FW_FL_HDR)) {
memset(&hdr, 0, sizeof(hdr));
offset += read_header(&buf[offset], &hdr);
@@ -776,6 +783,14 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context)
/* size also includes the header */
file_blk_size = hdr.length - FW_FL_HDR;
+ /* make sure that enough data is there */
+ if (offset + file_blk_size > img_sz) {
+ ret = -EINVAL;
+ dev_err(tas_dev->dev,
+ "corrupt firmware file");
+ break;
+ }
+
switch (hdr.file_id) {
case 0:
ret = sdw_nwrite_no_pm(tas_dev->sdw_peripheral,
@@ -808,7 +823,8 @@ static void tas2783_fw_ready(const struct firmware *fmw, void *context)
break;
}
mutex_unlock(&tas_dev->pde_lock);
- tas2783_update_calibdata(tas_dev);
+ if (!ret)
+ tas2783_update_calibdata(tas_dev);
out:
if (!ret)
diff --git a/sound/soc/renesas/rcar/ssiu.c b/sound/soc/renesas/rcar/ssiu.c
index faf351126d57..244fb833292a 100644
--- a/sound/soc/renesas/rcar/ssiu.c
+++ b/sound/soc/renesas/rcar/ssiu.c
@@ -509,7 +509,7 @@ void rsnd_parse_connect_ssiu(struct rsnd_dai *rdai,
int rsnd_ssiu_probe(struct rsnd_priv *priv)
{
struct device *dev = rsnd_priv_to_dev(priv);
- struct device_node *node;
+ struct device_node *node __free(device_node) = rsnd_ssiu_of_node(priv);
struct rsnd_ssiu *ssiu;
struct rsnd_mod_ops *ops;
const int *list = NULL;
@@ -522,7 +522,6 @@ int rsnd_ssiu_probe(struct rsnd_priv *priv)
* see
* rsnd_ssiu_bufsif_to_id()
*/
- node = rsnd_ssiu_of_node(priv);
if (node)
nr = rsnd_node_count(priv, node, SSIU_NAME);
else
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 13f68f7b6dd6..0ccb6775f4de 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -894,7 +894,8 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
return ret;
}
- control->values = devm_kzalloc(dev, hweight64(control->cn_list), GFP_KERNEL);
+ control->values = devm_kcalloc(dev, hweight64(control->cn_list),
+ sizeof(int), GFP_KERNEL);
if (!control->values)
return -ENOMEM;
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index f7c8c16308de..3848c7df1916 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -1277,7 +1277,7 @@ static int is_sdca_endpoint_present(struct device *dev,
struct sdw_slave *slave;
struct device *sdw_dev;
const char *sdw_codec_name;
- int i;
+ int ret, i;
dlc = kzalloc(sizeof(*dlc), GFP_KERNEL);
if (!dlc)
@@ -1307,13 +1307,16 @@ static int is_sdca_endpoint_present(struct device *dev,
}
slave = dev_to_sdw_dev(sdw_dev);
- if (!slave)
- return -EINVAL;
+ if (!slave) {
+ ret = -EINVAL;
+ goto put_device;
+ }
/* Make sure BIOS provides SDCA properties */
if (!slave->sdca_data.interface_revision) {
dev_warn(&slave->dev, "SDCA properties not found in the BIOS\n");
- return 1;
+ ret = 1;
+ goto put_device;
}
for (i = 0; i < slave->sdca_data.num_functions; i++) {
@@ -1322,7 +1325,8 @@ static int is_sdca_endpoint_present(struct device *dev,
if (dai_type == dai_info->dai_type) {
dev_dbg(&slave->dev, "DAI type %d sdca function %s found\n",
dai_type, slave->sdca_data.function[i].name);
- return 1;
+ ret = 1;
+ goto put_device;
}
}
@@ -1330,7 +1334,11 @@ static int is_sdca_endpoint_present(struct device *dev,
"SDCA device function for DAI type %d not supported, skip endpoint\n",
dai_info->dai_type);
- return 0;
+ ret = 0;
+
+put_device:
+ put_device(sdw_dev);
+ return ret;
}
int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card,
diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c
index 880f5afcce60..cc15624ecaff 100644
--- a/sound/usb/endpoint.c
+++ b/sound/usb/endpoint.c
@@ -1362,6 +1362,11 @@ int snd_usb_endpoint_set_params(struct snd_usb_audio *chip,
ep->sample_rem = ep->cur_rate % ep->pps;
ep->packsize[0] = ep->cur_rate / ep->pps;
ep->packsize[1] = (ep->cur_rate + (ep->pps - 1)) / ep->pps;
+ if (ep->packsize[1] > ep->maxpacksize) {
+ usb_audio_dbg(chip, "Too small maxpacksize %u for rate %u / pps %u\n",
+ ep->maxpacksize, ep->cur_rate, ep->pps);
+ return -EINVAL;
+ }
/* calculate the frequency in 16.16 format */
ep->freqm = ep->freqn;
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 6f00e0d52382..3af71d42b9b9 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -921,7 +921,7 @@ static int parse_term_uac2_clock_source(struct mixer_build *state,
{
struct uac_clock_source_descriptor *d = p1;
- term->type = UAC3_CLOCK_SOURCE << 16; /* virtual type */
+ term->type = UAC2_CLOCK_SOURCE << 16; /* virtual type */
term->id = id;
term->name = d->iClockSource;
return 0;
@@ -3086,6 +3086,8 @@ static int snd_usb_mixer_controls_badd(struct usb_mixer_interface *mixer,
int i;
assoc = usb_ifnum_to_if(dev, ctrlif)->intf_assoc;
+ if (!assoc)
+ return -EINVAL;
/* Detect BADD capture/playback channels from AS EP descriptors */
for (i = 0; i < assoc->bInterfaceCount; i++) {
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 71638e6dfb20..61bd61ffb1b2 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2022,12 +2022,15 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
case USB_ID(0x16d0, 0x09d8): /* NuPrime IDA-8 */
case USB_ID(0x16d0, 0x09db): /* NuPrime Audio DAC-9 */
case USB_ID(0x16d0, 0x09dd): /* Encore mDSD */
+ case USB_ID(0x16d0, 0x0ab1): /* PureAudio APA DAC */
+ case USB_ID(0x16d0, 0xeca1): /* PureAudio Lotus DAC5, DAC5 SE, DAC5 Pro */
case USB_ID(0x1db5, 0x0003): /* Bryston BDA3 */
case USB_ID(0x20a0, 0x4143): /* WaveIO USB Audio 2.0 */
case USB_ID(0x22e1, 0xca01): /* HDTA Serenade DSD */
case USB_ID(0x249c, 0x9326): /* M2Tech Young MkIII */
case USB_ID(0x2616, 0x0106): /* PS Audio NuWave DAC */
case USB_ID(0x2622, 0x0041): /* Audiolab M-DAC+ */
+ case USB_ID(0x2622, 0x0061): /* LEAK Stereo 230 */
case USB_ID(0x278b, 0x5100): /* Rotel RC-1590 */
case USB_ID(0x27f7, 0x3002): /* W4S DAC-2v2SE */
case USB_ID(0x29a2, 0x0086): /* Mutec MC3+ USB */
@@ -2267,6 +2270,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_FIXED_RATE),
DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */
QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER),
+ DEVICE_FLG(0x1038, 0x1294, /* SteelSeries Arctis Pro Wireless */
+ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE),
DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */
QUIRK_FLAG_GET_SAMPLE_RATE),
DEVICE_FLG(0x12d1, 0x3a07, /* Huawei Technologies Co., Ltd. */
@@ -2297,6 +2302,10 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_IGNORE_CLOCK_SOURCE),
DEVICE_FLG(0x1686, 0x00dd, /* Zoom R16/24 */
QUIRK_FLAG_TX_LENGTH | QUIRK_FLAG_CTL_MSG_DELAY_1M),
+ DEVICE_FLG(0x16d0, 0x0ab1, /* PureAudio APA DAC */
+ QUIRK_FLAG_DSD_RAW),
+ DEVICE_FLG(0x16d0, 0xeca1, /* PureAudio Lotus DAC5, DAC5 SE and DAC5 Pro */
+ QUIRK_FLAG_DSD_RAW),
DEVICE_FLG(0x17aa, 0x1046, /* Lenovo ThinkStation P620 Rear Line-in, Line-out and Microphone */
QUIRK_FLAG_DISABLE_AUTOSUSPEND),
DEVICE_FLG(0x17aa, 0x104d, /* Lenovo ThinkStation P620 Internal Speaker + Front Headset */
@@ -2420,6 +2429,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
QUIRK_FLAG_DSD_RAW),
VENDOR_FLG(0x25ce, /* Mytek devices */
QUIRK_FLAG_DSD_RAW),
+ VENDOR_FLG(0x2622, /* IAG Limited devices */
+ QUIRK_FLAG_DSD_RAW),
VENDOR_FLG(0x278b, /* Rotel? */
QUIRK_FLAG_DSD_RAW),
VENDOR_FLG(0x292b, /* Gustard/Ess based devices */
diff --git a/tools/arch/riscv/include/asm/csr.h b/tools/arch/riscv/include/asm/csr.h
index 56d7367ee344..21d8cee04638 100644
--- a/tools/arch/riscv/include/asm/csr.h
+++ b/tools/arch/riscv/include/asm/csr.h
@@ -167,7 +167,8 @@
#define VSIP_TO_HVIP_SHIFT (IRQ_VS_SOFT - IRQ_S_SOFT)
#define VSIP_VALID_MASK ((_AC(1, UL) << IRQ_S_SOFT) | \
(_AC(1, UL) << IRQ_S_TIMER) | \
- (_AC(1, UL) << IRQ_S_EXT))
+ (_AC(1, UL) << IRQ_S_EXT) | \
+ (_AC(1, UL) << IRQ_PMU_OVF))
/* AIA CSR bits */
#define TOPI_IID_SHIFT 16
@@ -280,7 +281,7 @@
#define CSR_HPMCOUNTER30H 0xc9e
#define CSR_HPMCOUNTER31H 0xc9f
-#define CSR_SSCOUNTOVF 0xda0
+#define CSR_SCOUNTOVF 0xda0
#define CSR_SSTATUS 0x100
#define CSR_SIE 0x104
diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h
index 9792e329343e..1baa86dfe029 100644
--- a/tools/arch/x86/include/uapi/asm/vmx.h
+++ b/tools/arch/x86/include/uapi/asm/vmx.h
@@ -93,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
#define EXIT_REASON_MSR_READ_IMM 84
#define EXIT_REASON_MSR_WRITE_IMM 85
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 009633294b09..35aeeaf5f711 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -182,7 +182,7 @@ bpftool prog tracelog
bpftool prog tracelog { stdout | stderr } *PROG*
Dump the BPF stream of the program. BPF programs can write to these streams
- at runtime with the **bpf_stream_vprintk**\ () kfunc. The kernel may write
+ at runtime with the **bpf_stream_vprintk_impl**\ () kfunc. The kernel may write
error messages to the standard error stream. This facility should be used
only for debugging purposes.
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 49b0add392b1..95646290cb89 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -107,7 +107,7 @@ all: $(FILES)
__BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
- BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl -lz -llzma -lzstd
+ BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd
__BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
@@ -115,7 +115,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(
###############################
$(OUTPUT)test-all.bin:
- $(BUILD_ALL) || $(BUILD_ALL) -lopcodes -liberty
+ $(BUILD_ALL)
$(OUTPUT)test-hello.bin:
$(BUILD)
diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h
index 33c9b578b3b2..a25e38d1c874 100644
--- a/tools/include/uapi/linux/nsfs.h
+++ b/tools/include/uapi/linux/nsfs.h
@@ -53,6 +53,76 @@ enum init_ns_ino {
TIME_NS_INIT_INO = 0xEFFFFFFAU,
NET_NS_INIT_INO = 0xEFFFFFF9U,
MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
};
+struct nsfs_file_handle {
+ __u64 ns_id;
+ __u32 ns_type;
+ __u32 ns_inum;
+};
+
+#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */
+#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */
+
+enum init_ns_id {
+ IPC_NS_INIT_ID = 1ULL,
+ UTS_NS_INIT_ID = 2ULL,
+ USER_NS_INIT_ID = 3ULL,
+ PID_NS_INIT_ID = 4ULL,
+ CGROUP_NS_INIT_ID = 5ULL,
+ TIME_NS_INIT_ID = 6ULL,
+ NET_NS_INIT_ID = 7ULL,
+ MNT_NS_INIT_ID = 8ULL,
+#ifdef __KERNEL__
+ NS_LAST_INIT_ID = MNT_NS_INIT_ID,
+#endif
+};
+
+enum ns_type {
+ TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */
+ MNT_NS = (1ULL << 17), /* CLONE_NEWNS */
+ CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */
+ UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */
+ IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */
+ USER_NS = (1ULL << 28), /* CLONE_NEWUSER */
+ PID_NS = (1ULL << 29), /* CLONE_NEWPID */
+ NET_NS = (1ULL << 30), /* CLONE_NEWNET */
+};
+
+/**
+ * struct ns_id_req - namespace ID request structure
+ * @size: size of this structure
+ * @spare: reserved for future use
+ * @filter: filter mask
+ * @ns_id: last namespace id
+ * @user_ns_id: owning user namespace ID
+ *
+ * Structure for passing namespace ID and miscellaneous parameters to
+ * statns(2) and listns(2).
+ *
+ * For statns(2) @param represents the request mask.
+ * For listns(2) @param represents the last listed mount id (or zero).
+ */
+struct ns_id_req {
+ __u32 size;
+ __u32 spare;
+ __u64 ns_id;
+ struct /* listns */ {
+ __u32 ns_type;
+ __u32 spare2;
+ __u64 user_ns_id;
+ };
+};
+
+/*
+ * Special @user_ns_id value that can be passed to listns()
+ */
+#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */
+
+/* List of all ns_id_req versions. */
+#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */
+
#endif /* __LINUX_NSFS_H */
diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 80c028540656..d4e4e388e625 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -315,20 +315,20 @@ enum libbpf_tristate {
___param, sizeof(___param)); \
})
-extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args,
- __u32 len__sz, void *aux__prog) __weak __ksym;
-
-#define bpf_stream_printk(stream_id, fmt, args...) \
-({ \
- static const char ___fmt[] = fmt; \
- unsigned long long ___param[___bpf_narg(args)]; \
- \
- _Pragma("GCC diagnostic push") \
- _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
- ___bpf_fill(___param, args); \
- _Pragma("GCC diagnostic pop") \
- \
- bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param), NULL);\
+extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args,
+ __u32 len__sz, void *aux__prog) __weak __ksym;
+
+#define bpf_stream_printk(stream_id, fmt, args...) \
+({ \
+ static const char ___fmt[] = fmt; \
+ unsigned long long ___param[___bpf_narg(args)]; \
+ \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
+ ___bpf_fill(___param, args); \
+ _Pragma("GCC diagnostic pop") \
+ \
+ bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \
})
/* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args
diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 58086b101057..aadeb3abcad8 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -861,6 +861,18 @@ class TypeIndexedArray(Type):
return [f"{member} = {self.c_name};",
f"{presence} = n_{self.c_name};"]
+ def free_needs_iter(self):
+ return self.sub_type == 'nest'
+
+ def _free_lines(self, ri, var, ref):
+ lines = []
+ if self.sub_type == 'nest':
+ lines += [
+ f"for (i = 0; i < {var}->{ref}_count.{self.c_name}; i++)",
+ f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);',
+ ]
+ lines += f"free({var}->{ref}{self.c_name});",
+ return lines
class TypeNestTypeValue(Type):
def _complex_member_type(self, ri):
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 5700516aa84a..2dd5f5a60568 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -354,9 +354,6 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS)
FEATURE_CHECK_LDFLAGS-libaio = -lrt
-FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
-FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
-
CORE_CFLAGS += -fno-omit-frame-pointer
CORE_CFLAGS += -Wall
CORE_CFLAGS += -Wextra
@@ -930,6 +927,8 @@ ifdef BUILD_NONDISTRO
ifeq ($(feature-libbfd), 1)
EXTLIBS += -lbfd -lopcodes
+ FEATURE_CHECK_LDFLAGS-disassembler-four-args = -lbfd -lopcodes -ldl
+ FEATURE_CHECK_LDFLAGS-disassembler-init-styled = -lbfd -lopcodes -ldl
else
# we are on a system that requires -liberty and (maybe) -lz
# to link against -lbfd; test each case individually here
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 078634461df2..e8962c985d34 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -1867,6 +1867,7 @@ static int __cmd_report(bool display_info)
eops.sample = process_sample_event;
eops.comm = perf_event__process_comm;
eops.mmap = perf_event__process_mmap;
+ eops.mmap2 = perf_event__process_mmap2;
eops.namespaces = perf_event__process_namespaces;
eops.tracing_data = perf_event__process_tracing_data;
session = perf_session__new(&data, &eops);
@@ -2023,6 +2024,7 @@ static int __cmd_contention(int argc, const char **argv)
eops.sample = process_sample_event;
eops.comm = perf_event__process_comm;
eops.mmap = perf_event__process_mmap;
+ eops.mmap2 = perf_event__process_mmap2;
eops.tracing_data = perf_event__process_tracing_data;
perf_env__init(&host_env);
diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh
index 7248a74ca2a3..6dd90519f45c 100755
--- a/tools/perf/tests/shell/lock_contention.sh
+++ b/tools/perf/tests/shell/lock_contention.sh
@@ -13,15 +13,18 @@ cleanup() {
rm -f ${perfdata}
rm -f ${result}
rm -f ${errout}
- trap - EXIT TERM INT
+ trap - EXIT TERM INT ERR
}
trap_cleanup() {
+ if (( $? == 139 )); then #SIGSEGV
+ err=1
+ fi
echo "Unexpected signal in ${FUNCNAME[1]}"
cleanup
exit ${err}
}
-trap trap_cleanup EXIT TERM INT
+trap trap_cleanup EXIT TERM INT ERR
check() {
if [ "$(id -u)" != 0 ]; then
@@ -145,7 +148,7 @@ test_aggr_cgroup()
fi
# the perf lock contention output goes to the stderr
- perf lock con -a -b -g -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result count is not 1:" "$(cat "${result}" | wc -l)"
err=1
@@ -271,7 +274,7 @@ test_cgroup_filter()
return
fi
- perf lock con -a -b -g -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -F wait_total -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result should have a cgroup result:" "$(cat "${result}")"
err=1
@@ -279,7 +282,7 @@ test_cgroup_filter()
fi
cgroup=$(cat "${result}" | awk '{ print $3 }')
- perf lock con -a -b -g -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result}
+ perf lock con -a -b --lock-cgroup -E 1 -G "${cgroup}" -q -- perf bench sched messaging -p > /dev/null 2> ${result}
if [ "$(cat "${result}" | wc -l)" != "1" ]; then
echo "[Fail] BPF result should have a result with cgroup filter:" "$(cat "${cgroup}")"
err=1
@@ -338,4 +341,5 @@ test_aggr_task_stack_filter
test_cgroup_filter
test_csv_output
+cleanup
exit ${err}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4f2a6e10ed5c..4e12be579140 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1022,12 +1022,9 @@ static int write_bpf_prog_info(struct feat_fd *ff,
down_read(&env->bpf_progs.lock);
- if (env->bpf_progs.infos_cnt == 0)
- goto out;
-
ret = do_write(ff, &env->bpf_progs.infos_cnt,
sizeof(env->bpf_progs.infos_cnt));
- if (ret < 0)
+ if (ret < 0 || env->bpf_progs.infos_cnt == 0)
goto out;
root = &env->bpf_progs.infos;
@@ -1067,13 +1064,10 @@ static int write_bpf_btf(struct feat_fd *ff,
down_read(&env->bpf_progs.lock);
- if (env->bpf_progs.btfs_cnt == 0)
- goto out;
-
ret = do_write(ff, &env->bpf_progs.btfs_cnt,
sizeof(env->bpf_progs.btfs_cnt));
- if (ret < 0)
+ if (ret < 0 || env->bpf_progs.btfs_cnt == 0)
goto out;
root = &env->bpf_progs.btfs;
diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c
index 01147fbf73b3..6434c2dccd4a 100644
--- a/tools/perf/util/libbfd.c
+++ b/tools/perf/util/libbfd.c
@@ -38,6 +38,39 @@ struct a2l_data {
asymbol **syms;
};
+/*
+ * Lock callback handed to bfd_thread_init(): takes the mutex passed in
+ * through @bfd_mutex and always reports success.
+ */
+static bool perf_bfd_lock(void *bfd_mutex)
+{
+ mutex_lock(bfd_mutex);
+ return true;
+}
+
+/*
+ * Unlock callback handed to bfd_thread_init(): releases the mutex passed in
+ * through @bfd_mutex and always reports success.
+ */
+static bool perf_bfd_unlock(void *bfd_mutex)
+{
+ mutex_unlock(bfd_mutex);
+ return true;
+}
+
+/*
+ * libbfd setup routine: creates the recursive mutex used by the lock
+ * callbacks, initializes libbfd and registers the callbacks with it.
+ * Failures are only logged through pr_err(); nothing is reported to the
+ * caller.
+ */
+static void perf_bfd_init(void)
+{
+ /* static storage: its address is handed to bfd_thread_init() below */
+ static struct mutex bfd_mutex;
+
+ mutex_init_recursive(&bfd_mutex);
+
+ if (bfd_init() != BFD_INIT_MAGIC) {
+ pr_err("Error initializing libbfd\n");
+ return;
+ }
+ if (!bfd_thread_init(perf_bfd_lock, perf_bfd_unlock, &bfd_mutex))
+ pr_err("Error initializing libbfd threading\n");
+}
+
+/*
+ * Runs perf_bfd_init() through pthread_once(), so repeated calls initialize
+ * libbfd only once.
+ */
+static void ensure_bfd_init(void)
+{
+ static pthread_once_t bfd_init_once = PTHREAD_ONCE_INIT;
+
+ pthread_once(&bfd_init_once, perf_bfd_init);
+}
+
static int bfd_error(const char *string)
{
const char *errmsg;
@@ -132,6 +165,7 @@ static struct a2l_data *addr2line_init(const char *path)
bfd *abfd;
struct a2l_data *a2l = NULL;
+ ensure_bfd_init();
abfd = bfd_openr(path, NULL);
if (abfd == NULL)
return NULL;
@@ -288,6 +322,7 @@ int dso__load_bfd_symbols(struct dso *dso, const char *debugfile)
bfd *abfd;
u64 start, len;
+ ensure_bfd_init();
abfd = bfd_openr(debugfile, NULL);
if (!abfd)
return -1;
@@ -393,6 +428,7 @@ int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block
if (fd < 0)
return -1;
+ ensure_bfd_init();
abfd = bfd_fdopenr(filename, /*target=*/NULL, fd);
if (!abfd)
return -1;
@@ -421,6 +457,7 @@ int libbfd_filename__read_debuglink(const char *filename, char *debuglink,
asection *section;
bfd *abfd;
+ ensure_bfd_init();
abfd = bfd_openr(filename, NULL);
if (!abfd)
return -1;
@@ -480,6 +517,7 @@ int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused,
memset(tpath, 0, sizeof(tpath));
perf_exe(tpath, sizeof(tpath));
+ ensure_bfd_init();
bfdf = bfd_openr(tpath, NULL);
if (bfdf == NULL)
abort();
diff --git a/tools/perf/util/mutex.c b/tools/perf/util/mutex.c
index bca7f0717f35..7aa1f3f55a7d 100644
--- a/tools/perf/util/mutex.c
+++ b/tools/perf/util/mutex.c
@@ -17,7 +17,7 @@ static void check_err(const char *fn, int err)
#define CHECK_ERR(err) check_err(__func__, err)
-static void __mutex_init(struct mutex *mtx, bool pshared)
+static void __mutex_init(struct mutex *mtx, bool pshared, bool recursive)
{
pthread_mutexattr_t attr;
@@ -27,21 +27,27 @@ static void __mutex_init(struct mutex *mtx, bool pshared)
/* In normal builds enable error checking, such as recursive usage. */
CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK));
#endif
+ if (recursive)
+ CHECK_ERR(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE));
if (pshared)
CHECK_ERR(pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED));
-
CHECK_ERR(pthread_mutex_init(&mtx->lock, &attr));
CHECK_ERR(pthread_mutexattr_destroy(&attr));
}
void mutex_init(struct mutex *mtx)
{
- __mutex_init(mtx, /*pshared=*/false);
+ __mutex_init(mtx, /*pshared=*/false, /*recursive=*/false);
}
void mutex_init_pshared(struct mutex *mtx)
{
- __mutex_init(mtx, /*pshared=*/true);
+ __mutex_init(mtx, /*pshared=*/true, /*recursive=*/false);
+}
+
+void mutex_init_recursive(struct mutex *mtx)
+{
+ __mutex_init(mtx, /*pshared=*/false, /*recursive=*/true);
}
void mutex_destroy(struct mutex *mtx)
diff --git a/tools/perf/util/mutex.h b/tools/perf/util/mutex.h
index 38458f00846f..70232d8d094f 100644
--- a/tools/perf/util/mutex.h
+++ b/tools/perf/util/mutex.h
@@ -104,6 +104,8 @@ void mutex_init(struct mutex *mtx);
* process-private attribute.
*/
void mutex_init_pshared(struct mutex *mtx);
+/* Initializes a mutex that may be recursively held on the same thread. */
+void mutex_init_recursive(struct mutex *mtx);
void mutex_destroy(struct mutex *mtx);
void mutex_lock(struct mutex *mtx) EXCLUSIVE_LOCK_FUNCTION(*mtx);
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 70b28c1e653e..f2a2fd236ca8 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -50,6 +50,7 @@ CONFIG_IPV6_SIT=y
CONFIG_IPV6_TUNNEL=y
CONFIG_KEYS=y
CONFIG_LIRC=y
+CONFIG_LIVEPATCH=y
CONFIG_LWTUNNEL=y
CONFIG_MODULE_SIG=y
CONFIG_MODULE_SRCVERSION_ALL=y
@@ -111,6 +112,8 @@ CONFIG_IP6_NF_FILTER=y
CONFIG_NF_NAT=y
CONFIG_PACKET=y
CONFIG_RC_CORE=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_LIVEPATCH=m
CONFIG_SECURITY=y
CONFIG_SECURITYFS=y
CONFIG_SYN_COOKIES=y
diff --git a/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c
new file mode 100644
index 000000000000..72aa5376c30e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/livepatch_trampoline.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "testing_helpers.h"
+#include "livepatch_trampoline.skel.h"
+
+/*
+ * Load samples/livepatch/livepatch-sample.ko from the kernel build tree.
+ * Returns whatever load_module() returns.
+ */
+static int load_livepatch(void)
+{
+ char path[4096];
+
+ /*
+  * CI will set KBUILD_OUTPUT; otherwise fall back to a path relative to
+  * the current working directory.
+  */
+ snprintf(path, sizeof(path), "%s/samples/livepatch/livepatch-sample.ko",
+ getenv("KBUILD_OUTPUT") ? : "../../../..");
+
+ return load_module(path, env_verbosity > VERBOSE_NONE);
+}
+
+/*
+ * Disable the sample livepatch through sysfs and unload its module.
+ * Neither the system() nor the unload_module() result is checked.
+ */
+static void unload_livepatch(void)
+{
+ /* Disable the livepatch before unloading the module */
+ system("echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled");
+
+ unload_module("livepatch_sample", env_verbosity > VERBOSE_NONE);
+}
+
+/*
+ * Read /proc/cmdline and check that it starts with the text expected once
+ * the livepatch sample module is active.  Reading the file is also what
+ * triggers the fentry/fexit programs attached to cmdline_proc_show.
+ */
+static void read_proc_cmdline(void)
+{
+	/* Zero-filled so that a short read still leaves a terminated string. */
+	char buf[4096] = {};
+	int fd, ret;
+
+	fd = open("/proc/cmdline", O_RDONLY);
+	if (!ASSERT_OK_FD(fd, "open /proc/cmdline"))
+		return;
+
+	/* Leave the last byte alone so buf is always NUL-terminated. */
+	ret = read(fd, buf, sizeof(buf) - 1);
+	if (!ASSERT_GT(ret, 0, "read /proc/cmdline"))
+		goto out;
+
+	ASSERT_OK(strncmp(buf, "this has been live patched", 26), "strncmp");
+
+out:
+	close(fd);
+}
+
+/*
+ * Attach the fentry and fexit programs of the livepatch_trampoline skeleton
+ * in the order selected by @fexit_first, read /proc/cmdline and check that
+ * both programs recorded a hit for this process.
+ */
+static void __test_livepatch_trampoline(bool fexit_first)
+{
+ struct livepatch_trampoline *skel = NULL;
+ int err;
+
+ skel = livepatch_trampoline__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+ goto out;
+
+ /* the BPF programs ignore calls made by any other process */
+ skel->bss->my_pid = getpid();
+
+ if (!fexit_first) {
+ /* fentry program is attached first by default */
+ err = livepatch_trampoline__attach(skel);
+ if (!ASSERT_OK(err, "skel_attach"))
+ goto out;
+ } else {
+ /* Manually attach fexit program first. */
+ skel->links.fexit_cmdline = bpf_program__attach(skel->progs.fexit_cmdline);
+ if (!ASSERT_OK_PTR(skel->links.fexit_cmdline, "attach_fexit"))
+ goto out;
+
+ skel->links.fentry_cmdline = bpf_program__attach(skel->progs.fentry_cmdline);
+ if (!ASSERT_OK_PTR(skel->links.fentry_cmdline, "attach_fentry"))
+ goto out;
+ }
+
+ read_proc_cmdline();
+
+ ASSERT_EQ(skel->bss->fentry_hit, 1, "fentry_hit");
+ ASSERT_EQ(skel->bss->fexit_hit, 1, "fexit_hit");
+out:
+ livepatch_trampoline__destroy(skel);
+}
+
+/*
+ * Test entry point: load the livepatch sample module (retrying once after
+ * unloading a stale copy), run both attach-order subtests and unload the
+ * module again.
+ */
+void test_livepatch_trampoline(void)
+{
+ int retry_cnt = 0;
+
+retry:
+ if (load_livepatch()) {
+ if (retry_cnt) {
+ /* second failure in a row: record a test failure and give up */
+ ASSERT_OK(1, "load_livepatch");
+ goto out;
+ }
+ /*
+ * Something else (previous run of the same test?) loaded
+ * the KLP module. Unload the KLP module and retry.
+ */
+ unload_livepatch();
+ retry_cnt++;
+ goto retry;
+ }
+
+ if (test__start_subtest("fentry_first"))
+ __test_livepatch_trampoline(false);
+
+ if (test__start_subtest("fexit_first"))
+ __test_livepatch_trampoline(true);
+out:
+ unload_livepatch();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
index f8eb7f9d4fd2..8fade8bdc451 100644
--- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
+++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
@@ -6,11 +6,13 @@
#include <netinet/in.h>
#include <test_progs.h>
#include <unistd.h>
+#include <errno.h>
#include "cgroup_helpers.h"
#include "network_helpers.h"
#include "mptcp_sock.skel.h"
#include "mptcpify.skel.h"
#include "mptcp_subflow.skel.h"
+#include "mptcp_sockmap.skel.h"
#define NS_TEST "mptcp_ns"
#define ADDR_1 "10.0.1.1"
@@ -436,6 +438,142 @@ close_cgroup:
close(cgroup_fd);
}
+/*
+ * Test sockmap on MPTCP server handling non-mp-capable clients.
+ *
+ * Two plain clients connect to an MPTCP listener; the sockops program puts
+ * each passively established socket into sock_map at sk_index (0, then 1).
+ * Data sent by the first client must then arrive at the second client via
+ * the stream-verdict redirect to sock_map[redirect_idx].
+ */
+static void test_sockmap_with_mptcp_fallback(struct mptcp_sockmap *skel)
+{
+	int listen_fd = -1, client_fd1 = -1, client_fd2 = -1;
+	int server_fd1 = -1, server_fd2 = -1, sent, recvd;
+	char snd[9] = "123456789";
+	char rcv[10];
+
+	/* start server with MPTCP enabled */
+	listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0);
+	if (!ASSERT_OK_FD(listen_fd, "sockmap-fb:start_mptcp_server"))
+		return;
+
+	skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd));
+	skel->bss->sk_index = 0;
+	/* create client without MPTCP enabled */
+	client_fd1 = connect_to_fd_opts(listen_fd, NULL);
+	if (!ASSERT_OK_FD(client_fd1, "sockmap-fb:connect_to_fd"))
+		goto end;
+
+	/* fail here rather than with a confusing send/recv error later */
+	server_fd1 = accept(listen_fd, NULL, 0);
+	if (!ASSERT_OK_FD(server_fd1, "sockmap-fb:accept(server_fd1)"))
+		goto end;
+
+	skel->bss->sk_index = 1;
+	client_fd2 = connect_to_fd_opts(listen_fd, NULL);
+	if (!ASSERT_OK_FD(client_fd2, "sockmap-fb:connect_to_fd"))
+		goto end;
+
+	server_fd2 = accept(listen_fd, NULL, 0);
+	if (!ASSERT_OK_FD(server_fd2, "sockmap-fb:accept(server_fd2)"))
+		goto end;
+
+	/* test normal redirect behavior: data sent by client_fd1 can be
+	 * received by client_fd2
+	 */
+	skel->bss->redirect_idx = 1;
+	sent = send(client_fd1, snd, sizeof(snd), 0);
+	if (!ASSERT_EQ(sent, sizeof(snd), "sockmap-fb:send(client_fd1)"))
+		goto end;
+
+	/* try to recv more bytes to avoid truncation check */
+	recvd = recv(client_fd2, rcv, sizeof(rcv), 0);
+	if (!ASSERT_EQ(recvd, sizeof(snd), "sockmap-fb:recv(client_fd2)"))
+		goto end;
+
+end:
+	if (client_fd1 >= 0)
+		close(client_fd1);
+	if (client_fd2 >= 0)
+		close(client_fd2);
+	if (server_fd1 >= 0)
+		close(server_fd1);
+	if (server_fd2 >= 0)
+		close(server_fd2);
+	close(listen_fd);
+}
+
+/*
+ * Test sockmap rejection of MPTCP sockets - both server and client sides.
+ *
+ * Checks three paths: the bpf_sock_map_update() call made by the sockops
+ * program (result read back from helper_ret), and userspace
+ * bpf_map_update_elem() on the accepted server fd and on the client fd.
+ * All are expected to fail with -EOPNOTSUPP.
+ */
+static void test_sockmap_reject_mptcp(struct mptcp_sockmap *skel)
+{
+	int listen_fd = -1, server_fd = -1, client_fd1 = -1;
+	int err, zero = 0;
+
+	/* start server with MPTCP enabled */
+	listen_fd = start_mptcp_server(AF_INET, NULL, 0, 0);
+	if (!ASSERT_OK_FD(listen_fd, "start_mptcp_server"))
+		return;
+
+	skel->bss->trace_port = ntohs(get_socket_local_port(listen_fd));
+	skel->bss->sk_index = 0;
+	/* create client with MPTCP enabled */
+	client_fd1 = connect_to_fd(listen_fd, 0);
+	if (!ASSERT_OK_FD(client_fd1, "connect_to_fd client_fd1"))
+		goto end;
+
+	/* bpf_sock_map_update() called from sockops should reject MPTCP sk */
+	if (!ASSERT_EQ(skel->bss->helper_ret, -EOPNOTSUPP, "should reject"))
+		goto end;
+
+	/*
+	 * Check accept() itself: with fd -1 the map update below would fail
+	 * for an unrelated reason and be misreported.
+	 */
+	server_fd = accept(listen_fd, NULL, 0);
+	if (!ASSERT_OK_FD(server_fd, "accept server_fd"))
+		goto end;
+
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map),
+				  &zero, &server_fd, BPF_NOEXIST);
+	if (!ASSERT_EQ(err, -EOPNOTSUPP, "server should be disallowed"))
+		goto end;
+
+	/* MPTCP client should also be disallowed */
+	err = bpf_map_update_elem(bpf_map__fd(skel->maps.sock_map),
+				  &zero, &client_fd1, BPF_NOEXIST);
+	if (!ASSERT_EQ(err, -EOPNOTSUPP, "client should be disallowed"))
+		goto end;
+end:
+	if (client_fd1 >= 0)
+		close(client_fd1);
+	if (server_fd >= 0)
+		close(server_fd);
+	close(listen_fd);
+}
+
+/*
+ * "sockmap" subtest driver: joins a test cgroup, loads the mptcp_sockmap
+ * skeleton, attaches the sockops program to the cgroup and the
+ * stream-verdict program to sock_map, creates the test netns and then runs
+ * the fallback and rejection scenarios.  Resources are released in reverse
+ * order through the labels at the end.
+ */
+static void test_mptcp_sockmap(void)
+{
+ struct mptcp_sockmap *skel;
+ struct netns_obj *netns;
+ int cgroup_fd, err;
+
+ cgroup_fd = test__join_cgroup("/mptcp_sockmap");
+ if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_sockmap"))
+ return;
+
+ skel = mptcp_sockmap__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_sockmap"))
+ goto close_cgroup;
+
+ /* storing the link in skel->links lets skeleton destroy release it */
+ skel->links.mptcp_sockmap_inject =
+ bpf_program__attach_cgroup(skel->progs.mptcp_sockmap_inject, cgroup_fd);
+ if (!ASSERT_OK_PTR(skel->links.mptcp_sockmap_inject, "attach sockmap"))
+ goto skel_destroy;
+
+ err = bpf_prog_attach(bpf_program__fd(skel->progs.mptcp_sockmap_redirect),
+ bpf_map__fd(skel->maps.sock_map),
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (!ASSERT_OK(err, "bpf_prog_attach stream verdict"))
+ goto skel_destroy;
+
+ netns = netns_new(NS_TEST, true);
+ if (!ASSERT_OK_PTR(netns, "netns_new: mptcp_sockmap"))
+ goto skel_destroy;
+
+ /* NOTE(review): no ASSERT here; presumably endpoint_init() reports its own failures - confirm */
+ if (endpoint_init("subflow") < 0)
+ goto close_netns;
+
+ test_sockmap_with_mptcp_fallback(skel);
+ test_sockmap_reject_mptcp(skel);
+
+close_netns:
+ netns_free(netns);
+skel_destroy:
+ mptcp_sockmap__destroy(skel);
+close_cgroup:
+ close(cgroup_fd);
+}
+
void test_mptcp(void)
{
if (test__start_subtest("base"))
@@ -444,4 +582,6 @@ void test_mptcp(void)
test_mptcpify();
if (test__start_subtest("subflow"))
test_subflow();
+ if (test__start_subtest("sockmap"))
+ test_mptcp_sockmap();
}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
new file mode 100644
index 000000000000..c9efdd2a5b18
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "stacktrace_ips.skel.h"
+
+#ifdef __x86_64__
+/*
+ * Look up the stack trace stored under @key in the stack map @fd and check
+ * that its first @cnt entries resolve to the kernel symbols whose start
+ * addresses are passed as variadic arguments (each read as unsigned long).
+ *
+ * Returns 0 when the trace could be looked up, -1 on a setup failure, or the
+ * bpf_map_lookup_elem() error.  Every failure, including symbol mismatches,
+ * is also reported through the ASSERT_*() macros.
+ */
+static int check_stacktrace_ips(int fd, __u32 key, int cnt, ...)
+{
+	__u64 ips[PERF_MAX_STACK_DEPTH];
+	struct ksyms *ksyms = NULL;
+	int i, err = 0;
+	va_list args;
+
+	/* unlikely, but... checked first so nothing needs freeing on failure */
+	if (!ASSERT_LT(cnt, PERF_MAX_STACK_DEPTH, "check_max"))
+		return -1;
+
+	/* sorted by addr */
+	ksyms = load_kallsyms_local();
+	if (!ASSERT_OK_PTR(ksyms, "load_kallsyms_local"))
+		return -1;
+
+	/* callers ignore the return value, so a failed lookup must assert */
+	err = bpf_map_lookup_elem(fd, &key, ips);
+	if (!ASSERT_OK(err, "bpf_map_lookup_elem"))
+		goto out;
+
+	/*
+	 * Compare all symbols provided via arguments with stacktrace ips,
+	 * and their related symbol addresses.
+	 */
+	va_start(args, cnt);
+
+	for (i = 0; i < cnt; i++) {
+		unsigned long val;
+		struct ksym *ksym;
+
+		val = va_arg(args, unsigned long);
+		ksym = ksym_search_local(ksyms, ips[i]);
+		if (!ASSERT_OK_PTR(ksym, "ksym_search_local"))
+			break;
+		ASSERT_EQ(ksym->addr, val, "stack_cmp");
+	}
+
+	va_end(args);
+
+out:
+	free_kallsyms_local(ksyms);
+	return err;
+}
+
+/*
+ * Attach kprobe_multi_test to bpf_testmod_stacktrace_test (as a return
+ * probe when @retprobe is set), trigger it through a bpf_testmod read and
+ * check the recorded call chain.  Skipped when the kernel is not built with
+ * CONFIG_UNWINDER_ORC.
+ */
+static void test_stacktrace_ips_kprobe_multi(bool retprobe)
+{
+	LIBBPF_OPTS(bpf_kprobe_multi_opts, opts,
+		.retprobe = retprobe
+	);
+	struct stacktrace_ips *skel;
+
+	skel = stacktrace_ips__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+		return;
+
+	if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+		test__skip();
+		goto cleanup;
+	}
+
+	skel->links.kprobe_multi_test = bpf_program__attach_kprobe_multi_opts(
+							skel->progs.kprobe_multi_test,
+							"bpf_testmod_stacktrace_test", &opts);
+	if (!ASSERT_OK_PTR(skel->links.kprobe_multi_test, "bpf_program__attach_kprobe_multi_opts"))
+		goto cleanup;
+
+	trigger_module_test_read(1);
+
+	/* ksym_get_addr() below resolves names against the global table */
+	load_kallsyms();
+
+	/* expected frames, innermost first */
+	check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4,
+			     ksym_get_addr("bpf_testmod_stacktrace_test_3"),
+			     ksym_get_addr("bpf_testmod_stacktrace_test_2"),
+			     ksym_get_addr("bpf_testmod_stacktrace_test_1"),
+			     ksym_get_addr("bpf_testmod_test_read"));
+
+cleanup:
+	stacktrace_ips__destroy(skel);
+}
+
+/*
+ * Attach rawtp_test to the bpf_testmod_test_read raw tracepoint, trigger it
+ * and check that the recorded stack starts with the BPF program itself,
+ * followed by bpf_trace_run2.  Skipped when the kernel is not built with
+ * CONFIG_UNWINDER_ORC.
+ */
+static void test_stacktrace_ips_raw_tp(void)
+{
+	__u32 info_len = sizeof(struct bpf_prog_info);
+	struct bpf_prog_info info = {};
+	struct stacktrace_ips *skel;
+	__u64 bpf_prog_ksym = 0;
+	int err;
+
+	skel = stacktrace_ips__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load"))
+		return;
+
+	if (!skel->kconfig->CONFIG_UNWINDER_ORC) {
+		test__skip();
+		goto cleanup;
+	}
+
+	skel->links.rawtp_test = bpf_program__attach_raw_tracepoint(
+							skel->progs.rawtp_test,
+							"bpf_testmod_test_read");
+	if (!ASSERT_OK_PTR(skel->links.rawtp_test, "bpf_program__attach_raw_tracepoint"))
+		goto cleanup;
+
+	/* get bpf program address */
+	info.jited_ksyms = ptr_to_u64(&bpf_prog_ksym);
+	info.nr_jited_ksyms = 1;
+	err = bpf_prog_get_info_by_fd(bpf_program__fd(skel->progs.rawtp_test),
+				      &info, &info_len);
+	if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd"))
+		goto cleanup;
+
+	trigger_module_test_read(1);
+
+	/* ksym_get_addr() below resolves names against the global table */
+	load_kallsyms();
+
+	check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 2,
+			     bpf_prog_ksym,
+			     ksym_get_addr("bpf_trace_run2"));
+
+cleanup:
+	stacktrace_ips__destroy(skel);
+}
+
+static void __test_stacktrace_ips(void)
+{
+ if (test__start_subtest("kprobe_multi"))
+ test_stacktrace_ips_kprobe_multi(false);
+ if (test__start_subtest("kretprobe_multi"))
+ test_stacktrace_ips_kprobe_multi(true);
+ if (test__start_subtest("raw_tp"))
+ test_stacktrace_ips_raw_tp();
+}
+#else
+static void __test_stacktrace_ips(void)
+{
+ test__skip();
+}
+#endif
+
+void test_stacktrace_ips(void)
+{
+ __test_stacktrace_ips();
+}
diff --git a/tools/testing/selftests/bpf/progs/iters_looping.c b/tools/testing/selftests/bpf/progs/iters_looping.c
index 05fa5ce7fc59..d00fd570255a 100644
--- a/tools/testing/selftests/bpf/progs/iters_looping.c
+++ b/tools/testing/selftests/bpf/progs/iters_looping.c
@@ -161,3 +161,56 @@ int simplest_loop(void *ctx)
return 0;
}
+
+__used
+static void iterator_with_diff_stack_depth(int x)
+{
+ struct bpf_iter_num iter;
+
+ asm volatile (
+ "if r1 == 42 goto 0f;"
+ "*(u64 *)(r10 - 128) = 0;"
+ "0:"
+ /* create iterator */
+ "r1 = %[iter];"
+ "r2 = 0;"
+ "r3 = 10;"
+ "call %[bpf_iter_num_new];"
+ "1:"
+ /* consume next item */
+ "r1 = %[iter];"
+ "call %[bpf_iter_num_next];"
+ "if r0 == 0 goto 2f;"
+ "goto 1b;"
+ "2:"
+ /* destroy iterator */
+ "r1 = %[iter];"
+ "call %[bpf_iter_num_destroy];"
+ :
+ : __imm_ptr(iter), ITER_HELPERS
+ : __clobber_common, "r6"
+ );
+}
+
+SEC("socket")
+__success
+__naked int widening_stack_size_bug(void *ctx)
+{
+ /*
+ * Depending on iterator_with_diff_stack_depth() parameter value,
+ * subprogram stack depth is either 8 or 128 bytes. Arrange values so
+ * that it is 128 on a first call and 8 on a second. This triggered a
+ * bug in verifier's widen_imprecise_scalars() logic.
+ */
+ asm volatile (
+ "r6 = 0;"
+ "r1 = 0;"
+ "1:"
+ "call iterator_with_diff_stack_depth;"
+ "r1 = 42;"
+ "r6 += 1;"
+ "if r6 < 2 goto 1b;"
+ "r0 = 0;"
+ "exit;"
+ ::: __clobber_all);
+}
diff --git a/tools/testing/selftests/bpf/progs/livepatch_trampoline.c b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c
new file mode 100644
index 000000000000..15579d5bcd91
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/livepatch_trampoline.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int fentry_hit;
+int fexit_hit;
+int my_pid;
+
+SEC("fentry/cmdline_proc_show")
+int BPF_PROG(fentry_cmdline)
+{
+ if (my_pid != (bpf_get_current_pid_tgid() >> 32))
+ return 0;
+
+ fentry_hit = 1;
+ return 0;
+}
+
+SEC("fexit/cmdline_proc_show")
+int BPF_PROG(fexit_cmdline)
+{
+ if (my_pid != (bpf_get_current_pid_tgid() >> 32))
+ return 0;
+
+ fexit_hit = 1;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/mptcp_sockmap.c b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c
new file mode 100644
index 000000000000..d4eef0cbadb9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/mptcp_sockmap.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bpf_tracing_net.h"
+
+char _license[] SEC("license") = "GPL";
+
+int sk_index;
+int redirect_idx;
+int trace_port;
+int helper_ret;
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+ __uint(max_entries, 100);
+} sock_map SEC(".maps");
+
+/*
+ * sockops program: for a passively established connection on local port
+ * trace_port, try to insert the socket into sock_map at index sk_index and
+ * store the helper's return value in helper_ret for userspace to inspect.
+ * Always returns 1.
+ */
+SEC("sockops")
+int mptcp_sockmap_inject(struct bpf_sock_ops *skops)
+{
+ struct bpf_sock *sk;
+
+ /* only accept specified connection */
+ if (skops->local_port != trace_port ||
+ skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+ return 1;
+
+ sk = skops->sk;
+ if (!sk)
+ return 1;
+
+ /* add the socket to sock_map[sk_index]; BPF_NOEXIST: only if the slot is free */
+ helper_ret = bpf_sock_map_update(skops, &sock_map, &sk_index, BPF_NOEXIST);
+
+ return 1;
+}
+
+SEC("sk_skb/stream_verdict")
+int mptcp_sockmap_redirect(struct __sk_buff *skb)
+{
+ /* redirect skb to the sk under sock_map[redirect_idx] */
+ return bpf_sk_redirect_map(skb, &sock_map, redirect_idx, 0);
+}
diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
new file mode 100644
index 000000000000..a96c8150d7f5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16384);
+ __type(key, __u32);
+ __type(value, stack_trace_t);
+} stackmap SEC(".maps");
+
+extern bool CONFIG_UNWINDER_ORC __kconfig __weak;
+
+/*
+ * This function is here to have CONFIG_UNWINDER_ORC
+ * used and added to object BTF.
+ */
+int unused(void)
+{
+ return CONFIG_UNWINDER_ORC ? 0 : 1;
+}
+
+__u32 stack_key;
+
+SEC("kprobe.multi")
+int kprobe_multi_test(struct pt_regs *ctx)
+{
+ stack_key = bpf_get_stackid(ctx, &stackmap, 0);
+ return 0;
+}
+
+/*
+ * Raw tracepoint program: record the current kernel stack in stackmap and
+ * publish the resulting stack id through stack_key.
+ */
+SEC("raw_tp/bpf_testmod_test_read")
+int rawtp_test(void *ctx)
+{
+ /*
+  * flags == 0, so no frames are skipped: the userspace test expects the
+  * BPF program's own entry to be the first ip in the recorded stack.
+  */
+ stack_key = bpf_get_stackid(ctx, &stackmap, 0);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c
index b4a0d0cc8ec8..3662515f0107 100644
--- a/tools/testing/selftests/bpf/progs/stream_fail.c
+++ b/tools/testing/selftests/bpf/progs/stream_fail.c
@@ -10,7 +10,7 @@ SEC("syscall")
__failure __msg("Possibly NULL pointer passed")
int stream_vprintk_null_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL);
return 0;
}
@@ -18,7 +18,7 @@ SEC("syscall")
__failure __msg("R3 type=scalar expected=")
int stream_vprintk_scalar_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL);
return 0;
}
@@ -26,7 +26,7 @@ SEC("syscall")
__failure __msg("arg#1 doesn't point to a const string")
int stream_vprintk_string_arg(void *ctx)
{
- bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0, NULL);
+ bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c
index 23217f06a3ec..663a80990f8f 100644
--- a/tools/testing/selftests/bpf/progs/task_work.c
+++ b/tools/testing/selftests/bpf/progs/task_work.c
@@ -66,7 +66,7 @@ int oncpu_hash_map(struct pt_regs *args)
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
return 0;
}
@@ -80,7 +80,7 @@ int oncpu_array_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work, NULL);
+ bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL);
return 0;
}
@@ -102,6 +102,6 @@ int oncpu_lru_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&lrumap, &key);
if (!work || work->data[0])
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c
index 77fe8f28facd..1270953fd092 100644
--- a/tools/testing/selftests/bpf/progs/task_work_fail.c
+++ b/tools/testing/selftests/bpf/progs/task_work_fail.c
@@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL);
return 0;
}
@@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args)
struct bpf_task_work tw;
task = bpf_get_current_task_btf();
- bpf_task_work_schedule_resume(task, &tw, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL);
return 0;
}
@@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args)
struct task_struct *task;
task = bpf_get_current_task_btf();
- bpf_task_work_schedule_resume(task, NULL, &hmap, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL);
return 0;
}
@@ -91,6 +91,6 @@ int map_null(struct pt_regs *args)
work = bpf_map_lookup_elem(&arrmap, &key);
if (!work)
return 0;
- bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work, NULL);
+ bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL);
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c
index 90fca06fff56..55e555f7f41b 100644
--- a/tools/testing/selftests/bpf/progs/task_work_stress.c
+++ b/tools/testing/selftests/bpf/progs/task_work_stress.c
@@ -51,8 +51,8 @@ int schedule_task_work(void *ctx)
if (!work)
return 0;
}
- err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap,
- process_work, NULL);
+ err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap,
+ process_work, NULL);
if (err)
__sync_fetch_and_add(&schedule_error, 1);
else
diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
index 8074bc5f6f20..ed0a4721d8fd 100644
--- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c
@@ -417,6 +417,30 @@ noinline int bpf_testmod_fentry_test11(u64 a, void *b, short c, int d,
return a + (long)b + c + d + (long)e + f + g + h + i + j + k;
}
+noinline void bpf_testmod_stacktrace_test(void)
+{
+ /* used for stacktrace test as attach function; its only statement is the empty asm below */
+ asm volatile ("");
+}
+
+noinline void bpf_testmod_stacktrace_test_3(void)
+{
+ bpf_testmod_stacktrace_test(); /* innermost call of the chain: the stacktrace attach function */
+ asm volatile (""); /* NOTE(review): presumably keeps the call above from being emitted as a tail call - confirm */
+}
+
+noinline void bpf_testmod_stacktrace_test_2(void)
+{
+ bpf_testmod_stacktrace_test_3(); /* middle link of the call chain */
+ asm volatile (""); /* NOTE(review): presumably keeps the call above from being emitted as a tail call - confirm */
+}
+
+noinline void bpf_testmod_stacktrace_test_1(void)
+{
+ bpf_testmod_stacktrace_test_2(); /* outermost link of the call chain */
+ asm volatile (""); /* NOTE(review): presumably keeps the call above from being emitted as a tail call - confirm */
+}
+
int bpf_testmod_fentry_ok;
noinline ssize_t
@@ -497,6 +521,8 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
21, 22, 23, 24, 25, 26) != 231)
goto out;
+ bpf_testmod_stacktrace_test_1();
+
bpf_testmod_fentry_ok = 1;
out:
return -EIO; /* always fail */
diff --git a/tools/testing/selftests/coredump/.gitignore b/tools/testing/selftests/coredump/.gitignore
new file mode 100644
index 000000000000..097f52db0be9
--- /dev/null
+++ b/tools/testing/selftests/coredump/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stackdump_test
+coredump_socket_test
+coredump_socket_protocol_test
diff --git a/tools/testing/selftests/coredump/Makefile b/tools/testing/selftests/coredump/Makefile
index 77b3665c73c7..dece1a31d561 100644
--- a/tools/testing/selftests/coredump/Makefile
+++ b/tools/testing/selftests/coredump/Makefile
@@ -1,7 +1,13 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
-TEST_GEN_PROGS := stackdump_test
+TEST_GEN_PROGS := stackdump_test \
+ coredump_socket_test \
+ coredump_socket_protocol_test
TEST_FILES := stackdump
include ../lib.mk
+
+$(OUTPUT)/stackdump_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_test: coredump_test_helpers.c
+$(OUTPUT)/coredump_socket_protocol_test: coredump_test_helpers.c
diff --git a/tools/testing/selftests/coredump/coredump_socket_protocol_test.c b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
new file mode 100644
index 000000000000..d19b6717c53e
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_protocol_test.c
@@ -0,0 +1,1568 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+#define NUM_CRASHING_COREDUMPS 5
+
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+ /* Start from "no server, no tmpfs" sentinels, then save the current core_pattern. */
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+ /* A zero-byte read is accepted only at EOF; the content must leave room for the terminator. */
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ ASSERT_TRUE(ret || feof(file));
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+ /* ret was asserted to be below the buffer size, so this index is in bounds. */
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(coredump)
+{
+ const char *reason;
+ FILE *file;
+ int ret, status;
+ /* Stop and reap the coredump server if a test recorded its pid. */
+ if (self->pid_coredump_server > 0) {
+ kill(self->pid_coredump_server, SIGTERM);
+ waitpid(self->pid_coredump_server, &status, 0);
+ }
+ unlink("/tmp/coredump.file");
+ unlink("/tmp/coredump.socket");
+ /* Write the saved original_core_pattern back to procfs. */
+ file = fopen("/proc/sys/kernel/core_pattern", "w");
+ if (!file) {
+ reason = "Unable to open core_pattern";
+ goto fail;
+ }
+
+ ret = fprintf(file, "%s", self->original_core_pattern);
+ if (ret < 0) {
+ reason = "Unable to write to core_pattern";
+ goto fail;
+ }
+
+ ret = fclose(file);
+ if (ret) {
+ reason = "Unable to close core_pattern";
+ goto fail;
+ }
+ /* Release the detached tmpfs fd if one is recorded (>= 0). */
+ if (self->fd_tmpfs_detached >= 0) {
+ ret = close(self->fd_tmpfs_detached);
+ if (ret < 0) {
+ reason = "Unable to close detached tmpfs";
+ goto fail;
+ }
+ self->fd_tmpfs_detached = -1;
+ }
+
+ return;
+fail:
+ /* This should never happen; jumping here skips whatever cleanup steps follow the failing one */
+ fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+TEST_F(coredump, socket_request_kernel)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server acks with COREDUMP_KERNEL | COREDUMP_WAIT, then copies the socket data to /tmp/coredump.file, which must end up non-empty. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_kernel: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_kernel: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_kernel: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_kernel: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_kernel: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_kernel: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_request_kernel: creat coredump file failed: %m\n");
+ goto out;
+ }
+ /* Handshake: read the request, validate it, send the ack, then expect the REQACK marker. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_kernel: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_kernel: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_kernel: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_kernel: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+ /* Copy everything that arrives on the socket into the core file until EOF. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_kernel: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ /* A write that fails with ENOSPC is tolerated: keep reading from the socket. */
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "socket_request_kernel: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_kernel: completed successfully\n");
+out:
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+ /* The server must have stored a non-empty core file. */
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+ system("file /tmp/coredump.file");
+}
+
+TEST_F(coredump, socket_request_userspace)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server acks with COREDUMP_USERSPACE | COREDUMP_WAIT and then requires EOF with no data on the socket; the crash must still report WCOREDUMP. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_userspace: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_userspace: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_userspace: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_userspace: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_userspace: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_userspace: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+ /* Handshake: read the request, validate it, send the ack, then expect the REQACK marker. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_userspace: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_userspace: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_USERSPACE | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_userspace: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_userspace: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+ /* Any payload byte is a failure here; only a clean EOF (read() == 0) is accepted. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_userspace: unexpected data received (expected no coredump data)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_userspace: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_userspace: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_reject)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server acks with COREDUMP_REJECT | COREDUMP_WAIT and then requires EOF with no data; the crash must NOT report WCOREDUMP. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_reject: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_reject: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_reject: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_reject: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_reject: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_reject: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+ /* Handshake: read the request, validate it, send the ack, then expect the REQACK marker. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_reject: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_reject: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_reject: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_request_reject: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+ /* Any payload byte is a failure here; only a clean EOF (read() == 0) is accepted. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read > 0) {
+ fprintf(stderr, "socket_request_reject: unexpected data received (expected no coredump data for REJECT)\n");
+ goto out;
+ }
+
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_request_reject: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_reject: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_invalid_flag_combination)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server acks with COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT and must get COREDUMP_MARK_CONFLICTING back; the crash must NOT report WCOREDUMP. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+ /* Handshake: read and validate the request, then answer with the conflicting flag set. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: send_coredump_ack failed\n");
+ goto out;
+ }
+ /* Success for this test means receiving the CONFLICTING marker rather than REQACK. */
+ if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING)) {
+ fprintf(stderr, "socket_request_invalid_flag_combination: read_marker COREDUMP_MARK_CONFLICTING failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_flag_combination: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_unknown_flag)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server acks with the single flag bit (1ULL << 63) and must get COREDUMP_MARK_UNSUPPORTED back; the crash must NOT report WCOREDUMP. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_unknown_flag: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_unknown_flag: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_unknown_flag: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_unknown_flag: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+ /* Handshake: read and validate the request, then answer with a flag outside the checked set. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_unknown_flag: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_unknown_flag: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0)) {
+ fprintf(stderr, "socket_request_unknown_flag: send_coredump_ack failed\n");
+ goto out;
+ }
+ /* Success for this test means receiving the UNSUPPORTED marker rather than REQACK. */
+ if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED)) {
+ fprintf(stderr, "socket_request_unknown_flag: read_marker COREDUMP_MARK_UNSUPPORTED failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_unknown_flag: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+TEST_F(coredump, socket_request_invalid_size_small)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+ /* Server passes COREDUMP_ACK_SIZE_VER0 / 2 as the last send_coredump_ack() argument (presumably the ack size - confirm) and must get COREDUMP_MARK_MINSIZE back. */
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+ /* Child: coredump server; every path leaves through _exit(exit_code). */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+ /* The server socket is set up: send the one-byte ready signal to the parent. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: accept4 failed: %m\n");
+ goto out;
+ }
+ /* The connecting peer's pidfd info must carry PIDFD_INFO_COREDUMP and PIDFD_COREDUMPED. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_size_small: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_size_small: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_size_small: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_size_small: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+ /* Handshake: read and validate the request, then answer with the halved size value. */
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_size_small: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_size_small: check_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ COREDUMP_ACK_SIZE_VER0 / 2)) {
+ fprintf(stderr, "socket_request_invalid_size_small: send_coredump_ack failed\n");
+ goto out;
+ }
+ /* Success for this test means receiving the MINSIZE marker rather than REQACK. */
+ if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_small: read_marker COREDUMP_MARK_MINSIZE failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_size_small: completed successfully\n");
+out:
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ self->pid_coredump_server = pid_coredump_server;
+ /* Parent: wait for the server's ready byte before forking the crashing child. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+ /* waitpid() must succeed, otherwise status would be read uninitialized below. */
+ ASSERT_EQ(waitpid(pid, &status, 0), pid);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: coredump ack sent with an oversized size argument.
+ *
+ * With core_pattern "@@/tmp/coredump.socket", the server child reads and
+ * checks the coredump request, then answers with send_coredump_ack() whose
+ * last argument is COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE (presumably the
+ * advertised ack size -- confirm against the helper). It then expects to
+ * read the COREDUMP_MARK_MAXSIZE marker from the socket.
+ *
+ * The parent asserts that the crashing child was terminated by a signal
+ * without WCOREDUMP set, and that its pidfd still reports PIDFD_COREDUMPED.
+ */
+TEST_F(coredump, socket_request_invalid_size_large)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_request_invalid_size_large: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_request_invalid_size_large: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ /* The connecting peer must be reported as a coredumping task. */
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_request_invalid_size_large: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_request_invalid_size_large: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_request_invalid_size_large: read_coredump_req failed\n");
+ goto out;
+ }
+
+ if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+ COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT)) {
+ fprintf(stderr, "socket_request_invalid_size_large: check_coredump_req failed\n");
+ goto out;
+ }
+
+ /* The deliberately invalid part: a size one page beyond COREDUMP_ACK_SIZE_VER0. */
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_large: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ /* Success for this test means the MAXSIZE marker comes back. */
+ if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE)) {
+ fprintf(stderr, "socket_request_invalid_size_large: read_marker COREDUMP_MARK_MAXSIZE failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_request_invalid_size_large: completed successfully\n");
+out:
+ /* Descriptors start at -1, so only the ones actually opened are closed. */
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* NOTE(review): presumably lets the fixture teardown reap the server -- confirm. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before triggering the crash. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ /* Signalled, but the wait status must not carry the core-dump flag. */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGSEGV
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGSEGV.
+ *
+ * The field is checked twice: by the server child on the pidfd of the
+ * connecting peer, and by the parent on the pidfd of the crashed child
+ * after it has been waited for.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGSEGV) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+ info.coredump_signal, SIGSEGV);
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read_coredump_req failed\n");
+ goto out;
+ }
+
+ /* Ack with COREDUMP_REJECT: this test only inspects the signal, no dump is stored. */
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+ /* Descriptors start at -1, so only the ones actually opened are closed. */
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* NOTE(review): presumably lets the fixture teardown reap the server -- confirm. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before triggering the crash. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ /* The wait status and the pidfd info must agree on SIGSEGV. */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGSEGV);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via socket coredump with SIGABRT
+ *
+ * Verify that when using socket-based coredump protocol,
+ * the coredump_signal field is correctly exposed as SIGABRT.
+ *
+ * Unlike the SIGSEGV variant, the crashing child here simply calls
+ * abort(). The field is checked both by the server child (on the peer
+ * pidfd) and by the parent (on the crashed child's pidfd).
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ struct coredump_req req = {};
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGABRT) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+ info.coredump_signal, SIGABRT);
+ goto out;
+ }
+
+ if (!read_coredump_req(fd_coredump, &req)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read_coredump_req failed\n");
+ goto out;
+ }
+
+ /* Ack with COREDUMP_REJECT: this test only inspects the signal, no dump is stored. */
+ if (!send_coredump_ack(fd_coredump, &req,
+ COREDUMP_REJECT | COREDUMP_WAIT, 0)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: send_coredump_ack failed\n");
+ goto out;
+ }
+
+ if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read_marker COREDUMP_MARK_REQACK failed\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+ /* Descriptors start at -1, so only the ones actually opened are closed. */
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* NOTE(review): presumably lets the fixture teardown reap the server -- confirm. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before triggering the crash. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ /* The child raises SIGABRT on itself via abort(). */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ abort();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ /* The wait status and the pidfd info must agree on SIGABRT. */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGABRT);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGABRT);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: one server process handles NUM_CRASHING_COREDUMPS coredumps, one
+ * connection at a time, with core_pattern "@@/tmp/coredump.socket".
+ *
+ * For each accepted connection the server checks the peer's pidfd coredump
+ * info, validates the coredump request, acks it with
+ * COREDUMP_KERNEL | COREDUMP_WAIT, expects the COREDUMP_MARK_REQACK marker
+ * and then drains the socket into a temporary file. The parent forks the
+ * crashing children and asserts that each one was signalled with WCOREDUMP
+ * set and that PIDFD_COREDUMPED is reported for it.
+ */
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
+{
+	int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+	pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+
+	/* Socket pair used only for the one-byte "server is ready" handshake. */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		/* Server child: reports its result solely through its exit code. */
+		int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+		int exit_code = EXIT_FAILURE;
+		struct coredump_req req = {};
+
+		close(ipc_sockets[0]);
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "Failed to create and listen on unix socket\n");
+			goto out;
+		}
+
+		/* Unblock the parent, which reads one byte before forking the crashers. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "Failed to notify parent via ipc socket\n");
+			goto out;
+		}
+		close(ipc_sockets[1]);
+
+		for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+			fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+			if (fd_coredump < 0) {
+				fprintf(stderr, "accept4 failed: %m\n");
+				goto out;
+			}
+
+			fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+			if (fd_peer_pidfd < 0) {
+				fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
+				goto out;
+			}
+
+			if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+				fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+
+			if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+				fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+			if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+				fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
+				goto out;
+			}
+
+			if (!read_coredump_req(fd_coredump, &req)) {
+				fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+						COREDUMP_KERNEL | COREDUMP_USERSPACE |
+						COREDUMP_REJECT | COREDUMP_WAIT)) {
+				fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!send_coredump_ack(fd_coredump, &req,
+					       COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+				fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+				fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+			if (fd_core_file < 0) {
+				fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
+				goto out;
+			}
+
+			/* Drain the socket into the temporary file until EOF. */
+			for (;;) {
+				char buffer[4096];
+				ssize_t bytes_read, bytes_write;
+
+				bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+				if (bytes_read < 0) {
+					fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
+					goto out;
+				}
+
+				if (bytes_read == 0)
+					break;
+
+				bytes_write = write(fd_core_file, buffer, bytes_read);
+				if (bytes_read != bytes_write) {
+					/* Out of space is tolerated: keep reading from the socket. */
+					if (bytes_write < 0 && errno == ENOSPC)
+						continue;
+					fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
+					goto out;
+				}
+			}
+
+			close(fd_core_file);
+			close(fd_peer_pidfd);
+			close(fd_coredump);
+			/*
+			 * Reset all three so the cleanup at "out" never closes a
+			 * descriptor a second time (fd_core_file used to be left
+			 * stale here and was closed twice on the success path).
+			 */
+			fd_core_file = -1;
+			fd_peer_pidfd = -1;
+			fd_coredump = -1;
+		}
+
+		exit_code = EXIT_SUCCESS;
+out:
+		if (fd_core_file >= 0)
+			close(fd_core_file);
+		if (fd_peer_pidfd >= 0)
+			close(fd_peer_pidfd);
+		if (fd_coredump >= 0)
+			close(fd_coredump);
+		if (fd_server >= 0)
+			close(fd_server);
+		_exit(exit_code);
+	}
+	/* NOTE(review): presumably lets the fixture teardown reap the server -- confirm. */
+	self->pid_coredump_server = pid_coredump_server;
+
+	/* Wait for the server's ready byte before triggering the crashes. */
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		pid[i] = fork();
+		ASSERT_GE(pid[i], 0);
+		if (pid[i] == 0)
+			crashing_child();
+		pidfd[i] = sys_pidfd_open(pid[i], 0);
+		ASSERT_GE(pidfd[i], 0);
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		/* Check waitpid() so status[i] is never inspected uninitialized. */
+		ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
+		ASSERT_TRUE(WIFSIGNALED(status[i]));
+		ASSERT_TRUE(WCOREDUMP(status[i]));
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+		ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+		ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+		ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	}
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: NUM_CRASHING_COREDUMPS coredumps, each handed to its own worker
+ * process, with core_pattern "@@/tmp/coredump.socket".
+ *
+ * The server child accepts one connection per crash, performs the
+ * request/ack handshake (COREDUMP_KERNEL | COREDUMP_WAIT, then the
+ * COREDUMP_MARK_REQACK marker), opens a temporary file and forks a worker
+ * that runs process_coredump_worker() on the three descriptors. The server
+ * reaps every worker before exiting and fails if any worker exited with a
+ * non-success code. The parent asserts that each crashing child was
+ * signalled with WCOREDUMP set and reports PIDFD_COREDUMPED.
+ */
+TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
+{
+	int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
+	pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
+	struct pidfd_info info = {};
+	int ipc_sockets[2];
+	char c;
+
+	ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
+	/* Socket pair used only for the one-byte "server is ready" handshake. */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid_coredump_server = fork();
+	ASSERT_GE(pid_coredump_server, 0);
+	if (pid_coredump_server == 0) {
+		/* Server child: reports its result solely through its exit code. */
+		int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
+
+		close(ipc_sockets[0]);
+		fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+		if (fd_server < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: create_and_listen_unix_socket failed: %m\n");
+			goto out;
+		}
+
+		/* Unblock the parent, which reads one byte before forking the crashers. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+			fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: write_nointr to ipc socket failed: %m\n");
+			goto out;
+		}
+		close(ipc_sockets[1]);
+
+		/* n_conns counts successfully forked workers; it bounds the reap loop below. */
+		while (n_conns < NUM_CRASHING_COREDUMPS) {
+			int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+			struct coredump_req req = {};
+			fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+			if (fd_coredump < 0) {
+				if (errno == EAGAIN || errno == EWOULDBLOCK)
+					continue;
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: accept4 failed: %m\n");
+				goto out;
+			}
+			fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+			if (fd_peer_pidfd < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_peer_pidfd failed\n");
+				goto out;
+			}
+			if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: get_pidfd_info failed\n");
+				goto out;
+			}
+			if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: missing PIDFD_INFO_COREDUMP or PIDFD_COREDUMPED\n");
+				goto out;
+			}
+			if (!read_coredump_req(fd_coredump, &req)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_coredump_req failed\n");
+				goto out;
+			}
+			if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
+						COREDUMP_KERNEL | COREDUMP_USERSPACE |
+						COREDUMP_REJECT | COREDUMP_WAIT)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: check_coredump_req failed\n");
+				goto out;
+			}
+			if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: send_coredump_ack failed\n");
+				goto out;
+			}
+			if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: read_marker failed\n");
+				goto out;
+			}
+			fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+			if (fd_core_file < 0) {
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: open_coredump_tmpfile failed: %m\n");
+				goto out;
+			}
+			pid_t worker = fork();
+			if (worker < 0) {
+				/*
+				 * Do not record a failed fork: a pid of -1 in
+				 * worker_pids[] would make the reap loop call
+				 * waitpid(-1, ...) and collect an arbitrary child.
+				 */
+				fprintf(stderr, "socket_multiple_crashing_coredumps_epoll_workers: fork failed: %m\n");
+				close(fd_core_file);
+				close(fd_peer_pidfd);
+				close(fd_coredump);
+				goto out;
+			}
+			if (worker == 0) {
+				close(fd_server);
+				/* NOTE(review): presumably does not return (exits the worker) -- confirm. */
+				process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
+			}
+			worker_pids[n_conns] = worker;
+			/* The worker owns its own copies; drop the server's references. */
+			if (fd_coredump >= 0)
+				close(fd_coredump);
+			if (fd_peer_pidfd >= 0)
+				close(fd_peer_pidfd);
+			if (fd_core_file >= 0)
+				close(fd_core_file);
+			n_conns++;
+		}
+		exit_code = EXIT_SUCCESS;
+out:
+		if (fd_server >= 0)
+			close(fd_server);
+
+		// Reap all worker processes
+		for (int i = 0; i < n_conns; i++) {
+			int wstatus;
+			if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
+				fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
+			} else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
+				fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
+				exit_code = EXIT_FAILURE;
+			}
+		}
+
+		_exit(exit_code);
+	}
+	/* NOTE(review): presumably lets the fixture teardown reap the server -- confirm. */
+	self->pid_coredump_server = pid_coredump_server;
+
+	/* Wait for the server's ready byte before triggering the crashes. */
+	EXPECT_EQ(close(ipc_sockets[1]), 0);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		pid[i] = fork();
+		ASSERT_GE(pid[i], 0);
+		if (pid[i] == 0)
+			crashing_child();
+		pidfd[i] = sys_pidfd_open(pid[i], 0);
+		ASSERT_GE(pidfd[i], 0);
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
+		ASSERT_TRUE(WIFSIGNALED(status[i]));
+		ASSERT_TRUE(WCOREDUMP(status[i]));
+	}
+
+	for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
+		info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
+		ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
+		ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+		ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+	}
+
+	wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/* NOTE(review): presumably expands to the harness entry point that runs the tests above -- confirm against the kselftest harness header. */
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_socket_test.c b/tools/testing/selftests/coredump/coredump_socket_test.c
new file mode 100644
index 000000000000..7e26d4a6a15d
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_socket_test.c
@@ -0,0 +1,742 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "coredump_test.h"
+
+/*
+ * Per-test setup: save the current contents of
+ * /proc/sys/kernel/core_pattern into the fixture so teardown can write it
+ * back, and create the detached tmpfs whose descriptor the tests pass to
+ * open_coredump_tmpfile().
+ */
+FIXTURE_SETUP(coredump)
+{
+ FILE *file;
+ int ret;
+
+ /* No server yet: teardown only signals the server when this is > 0. */
+ self->pid_coredump_server = -ESRCH;
+ self->fd_tmpfs_detached = -1;
+ file = fopen("/proc/sys/kernel/core_pattern", "r");
+ ASSERT_NE(NULL, file);
+
+ /* Accept either some bytes read or an empty file that hit EOF. */
+ ret = fread(self->original_core_pattern, 1, sizeof(self->original_core_pattern), file);
+ ASSERT_TRUE(ret || feof(file));
+ /* Strictly less than the buffer size, leaving room for the NUL below. */
+ ASSERT_LT(ret, sizeof(self->original_core_pattern));
+
+ self->original_core_pattern[ret] = '\0';
+ self->fd_tmpfs_detached = create_detached_tmpfs();
+ ASSERT_GE(self->fd_tmpfs_detached, 0);
+
+ ret = fclose(file);
+ ASSERT_EQ(0, ret);
+}
+
+/*
+ * Per-test teardown: stop the coredump server if one was recorded, remove
+ * the files the tests may have created under /tmp, write the saved
+ * core_pattern back and close the detached tmpfs descriptor.
+ *
+ * Every step is attempted even if an earlier one failed, so a failure to
+ * restore core_pattern no longer leaks the stream or the tmpfs descriptor.
+ * Only the first failure is reported on stderr.
+ */
+FIXTURE_TEARDOWN(coredump)
+{
+	const char *reason = NULL;
+	FILE *file;
+	int status;
+
+	if (self->pid_coredump_server > 0) {
+		kill(self->pid_coredump_server, SIGTERM);
+		waitpid(self->pid_coredump_server, &status, 0);
+	}
+	unlink("/tmp/coredump.file");
+	unlink("/tmp/coredump.socket");
+
+	file = fopen("/proc/sys/kernel/core_pattern", "w");
+	if (!file) {
+		reason = "Unable to open core_pattern";
+	} else {
+		if (fprintf(file, "%s", self->original_core_pattern) < 0)
+			reason = "Unable to write to core_pattern";
+
+		/* Always close the stream, even when the write above failed. */
+		if (fclose(file) && !reason)
+			reason = "Unable to close core_pattern";
+	}
+
+	if (self->fd_tmpfs_detached >= 0) {
+		if (close(self->fd_tmpfs_detached) < 0 && !reason)
+			reason = "Unable to close detached tmpfs";
+		self->fd_tmpfs_detached = -1;
+	}
+
+	/* This should never happen */
+	if (reason)
+		fprintf(stderr, "Failed to cleanup coredump test: %s\n", reason);
+}
+
+/*
+ * Test: simple socket coredump with core_pattern "@/tmp/coredump.socket".
+ *
+ * The server child accepts the connection, checks that the peer's pidfd
+ * reports PIDFD_COREDUMPED and copies everything it reads from the socket
+ * into /tmp/coredump.file. The parent asserts that the crashing child was
+ * signalled with WCOREDUMP set and that the resulting file is non-empty.
+ */
+TEST_F(coredump, socket)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket test: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket test: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket test: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket test: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket test: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ /* The connecting peer must be reported as a coredumping task. */
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket test: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket test: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* The parent later stat()s this path to verify a dump was written. */
+ fd_core_file = creat("/tmp/coredump.file", 0644);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket test: creat coredump file failed: %m\n");
+ goto out;
+ }
+
+ /* Copy the socket contents to the file until EOF. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket test: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ /* Out of space is tolerated: keep reading from the socket. */
+ if (bytes_write < 0 && errno == ENOSPC)
+ continue;
+ fprintf(stderr, "socket test: write to core file failed (read=%zd, write=%zd): %m\n", bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket test: completed successfully\n");
+out:
+ /* Descriptors start at -1, so only the ones actually opened are closed. */
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* Recorded so teardown can terminate and reap the server. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before triggering the crash. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ /* The server has finished by now, so the file must exist and hold data. */
+ ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_GT(st.st_size, 0);
+}
+
+/*
+ * Test: a regular userspace process connecting to the coredump socket is
+ * not mistaken for a coredumping task.
+ *
+ * The client child connects to /tmp/coredump.socket by itself, closes the
+ * connection and pauses. The server child requires PIDFD_INFO_COREDUMP to
+ * be present in the peer's info mask but PIDFD_COREDUMPED to be clear. The
+ * parent performs the same check, kills the client with SIGKILL and
+ * asserts that /tmp/coredump.file does not exist.
+ */
+TEST_F(coredump, socket_detect_userspace_client)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct stat st;
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_COREDUMP,
+ };
+ int ipc_sockets[2];
+ char c;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the client. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: accept4 failed: %m\n");
+ goto out;
+ }
+
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_detect_userspace_client: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_detect_userspace_client: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_detect_userspace_client: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ /* Inverted relative to the crash tests: the flag must NOT be set here. */
+ if (info.coredump_mask & PIDFD_COREDUMPED) {
+ fprintf(stderr, "socket_detect_userspace_client: PIDFD_COREDUMPED incorrectly set (should be userspace client)\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_detect_userspace_client: completed successfully\n");
+out:
+ /* Descriptors start at -1, so only the ones actually opened are closed. */
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* Recorded so teardown can terminate and reap the server. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before starting the client. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ /* Client child: connects on its own instead of crashing. */
+ int fd_socket;
+ ssize_t ret;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ /* sizeof on the literal counts the terminating NUL of the path. */
+ size_t coredump_sk_len =
+ offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd_socket < 0) {
+ fprintf(stderr, "socket_detect_userspace_client (client): socket failed: %m\n");
+ _exit(EXIT_FAILURE);
+ }
+
+ ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "socket_detect_userspace_client (client): connect failed: %m\n");
+ _exit(EXIT_FAILURE);
+ }
+
+ close(fd_socket);
+ /* Stay alive until the parent sends SIGKILL through the pidfd. */
+ pause();
+ fprintf(stderr, "socket_detect_userspace_client (client): completed successfully\n");
+ _exit(EXIT_SUCCESS);
+ }
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
+ ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+
+ ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+ ASSERT_EQ(close(pidfd), 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+
+ /* Nothing in this test writes a dump, so the file must be absent. */
+ ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
+ ASSERT_EQ(errno, ENOENT);
+}
+
+/*
+ * Test: core_pattern points at "@/tmp/coredump.socket" but this test never
+ * creates that socket. The crashing child must still be terminated by a
+ * signal, with no core-dump flag in its wait status.
+ */
+TEST_F(coredump, socket_enoent)
+{
+ int pidfd, status;
+ pid_t pid;
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ /* Signalled, but WCOREDUMP must be clear. */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+}
+
+/*
+ * Test: the coredump socket path exists but nothing listens on it.
+ *
+ * The server child creates and bind()s the socket without ever calling
+ * listen(), notifies the parent and exits. The crashing child must be
+ * terminated by a signal with no core-dump flag in its wait status.
+ */
+TEST_F(coredump, socket_no_listener)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ int ipc_sockets[2];
+ char c;
+ const struct sockaddr_un coredump_sk = {
+ .sun_family = AF_UNIX,
+ .sun_path = "/tmp/coredump.socket",
+ };
+ /* sizeof on the literal counts the terminating NUL of the path. */
+ size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
+ sizeof("/tmp/coredump.socket");
+
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ /* Socket pair used only for the one-byte "server is ready" handshake. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ /* Server child: reports its result solely through its exit code. */
+ int fd_server = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_no_listener: socket failed: %m\n");
+ goto out;
+ }
+
+ /* Bind only: the missing listen() is the condition under test. */
+ ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
+ if (ret < 0) {
+ fprintf(stderr, "socket_no_listener: bind failed: %m\n");
+ goto out;
+ }
+
+ /* Unblock the parent, which reads one byte before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_no_listener: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_no_listener: completed successfully\n");
+out:
+ if (fd_server >= 0)
+ close(fd_server);
+ close(ipc_sockets[1]);
+ _exit(exit_code);
+ }
+ /* Recorded so teardown can terminate and reap the server. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Wait for the server's ready byte before triggering the crash. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ /* Signalled, but WCOREDUMP must be clear. */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_FALSE(WCOREDUMP(status));
+
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGSEGV.
+ */
+TEST_F(coredump, socket_coredump_signal_sigsegv)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ /* Single '@' prefix: the simple socket coredump mode this test targets. */
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ /* Socketpair over which the server child reports that it is listening. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Coredump server child. Everything inside this block runs in the
+ * child only and always ends in _exit(); the verdict is carried by
+ * the exit status, diagnostics go to stderr.
+ */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Ready byte: the parent blocks on this before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ /* Wait for the coredump connection. */
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: accept4 failed: %m\n");
+ goto out;
+ }
+
+ /* Query the connecting task through the pidfd attached to the socket. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGSEGV) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: coredump_signal=%d, expected SIGSEGV=%d\n",
+ info.coredump_signal, SIGSEGV);
+ goto out;
+ }
+
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: open_coredump_tmpfile failed: %m\n");
+ goto out;
+ }
+
+ /* Copy the dump from the socket into the tmpfile until EOF. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ /* A short write is treated as a failure, not retried. */
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ fprintf(stderr, "socket_coredump_signal_sigsegv: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigsegv: completed successfully\n");
+out:
+ /* Common exit path: close whatever was opened, then report via exit status. */
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* Parent: remember the server pid in the fixture. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Block until the server has sent its ready byte. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ /* The crasher: crashing_child() faults on purpose. */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ crashing_child();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGSEGV);
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ /* The same signal must be visible through the parent's own pidfd after reaping. */
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGSEGV);
+
+ /* The server's checks passed only if it exited with status 0. */
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: PIDFD_INFO_COREDUMP_SIGNAL via simple socket coredump with SIGABRT
+ *
+ * Verify that when using simple socket-based coredump (@ pattern),
+ * the coredump_signal field is correctly exposed as SIGABRT.
+ */
+TEST_F(coredump, socket_coredump_signal_sigabrt)
+{
+ int pidfd, ret, status;
+ pid_t pid, pid_coredump_server;
+ struct pidfd_info info = {};
+ int ipc_sockets[2];
+ char c;
+
+ /* Single '@' prefix: the simple socket coredump mode this test targets. */
+ ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
+
+ /* Socketpair over which the server child reports that it is listening. */
+ ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+ ASSERT_EQ(ret, 0);
+
+ /*
+ * Coredump server child. Everything inside this block runs in the
+ * child only and always ends in _exit(); the verdict is carried by
+ * the exit status, diagnostics go to stderr.
+ */
+ pid_coredump_server = fork();
+ ASSERT_GE(pid_coredump_server, 0);
+ if (pid_coredump_server == 0) {
+ int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
+ int exit_code = EXIT_FAILURE;
+
+ close(ipc_sockets[0]);
+
+ fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
+ if (fd_server < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: create_and_listen_unix_socket failed: %m\n");
+ goto out;
+ }
+
+ /* Ready byte: the parent blocks on this before forking the crasher. */
+ if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write_nointr to ipc socket failed: %m\n");
+ goto out;
+ }
+
+ close(ipc_sockets[1]);
+
+ /* Wait for the coredump connection. */
+ fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
+ if (fd_coredump < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: accept4 failed: %m\n");
+ goto out;
+ }
+
+ /* Query the connecting task through the pidfd attached to the socket. */
+ fd_peer_pidfd = get_peer_pidfd(fd_coredump);
+ if (fd_peer_pidfd < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_peer_pidfd failed\n");
+ goto out;
+ }
+
+ if (!get_pidfd_info(fd_peer_pidfd, &info)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: get_pidfd_info failed\n");
+ goto out;
+ }
+
+ if (!(info.mask & PIDFD_INFO_COREDUMP)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP not set in mask\n");
+ goto out;
+ }
+
+ if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_COREDUMPED not set in coredump_mask\n");
+ goto out;
+ }
+
+ /* Verify coredump_signal is available and correct */
+ if (!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL)) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: PIDFD_INFO_COREDUMP_SIGNAL not set in mask\n");
+ goto out;
+ }
+
+ if (info.coredump_signal != SIGABRT) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: coredump_signal=%d, expected SIGABRT=%d\n",
+ info.coredump_signal, SIGABRT);
+ goto out;
+ }
+
+ fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
+ if (fd_core_file < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: open_coredump_tmpfile failed: %m\n");
+ goto out;
+ }
+
+ /* Copy the dump from the socket into the tmpfile until EOF. */
+ for (;;) {
+ char buffer[4096];
+ ssize_t bytes_read, bytes_write;
+
+ bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+ if (bytes_read < 0) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: read from coredump socket failed: %m\n");
+ goto out;
+ }
+
+ if (bytes_read == 0)
+ break;
+
+ /* A short write is treated as a failure, not retried. */
+ bytes_write = write(fd_core_file, buffer, bytes_read);
+ if (bytes_read != bytes_write) {
+ fprintf(stderr, "socket_coredump_signal_sigabrt: write to core file failed (read=%zd, write=%zd): %m\n",
+ bytes_read, bytes_write);
+ goto out;
+ }
+ }
+
+ exit_code = EXIT_SUCCESS;
+ fprintf(stderr, "socket_coredump_signal_sigabrt: completed successfully\n");
+out:
+ /* Common exit path: close whatever was opened, then report via exit status. */
+ if (fd_core_file >= 0)
+ close(fd_core_file);
+ if (fd_peer_pidfd >= 0)
+ close(fd_peer_pidfd);
+ if (fd_coredump >= 0)
+ close(fd_coredump);
+ if (fd_server >= 0)
+ close(fd_server);
+ _exit(exit_code);
+ }
+ /* Parent: remember the server pid in the fixture. */
+ self->pid_coredump_server = pid_coredump_server;
+
+ /* Block until the server has sent its ready byte. */
+ EXPECT_EQ(close(ipc_sockets[1]), 0);
+ ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+ EXPECT_EQ(close(ipc_sockets[0]), 0);
+
+ /* The crasher: unlike the SIGSEGV variant, the child terminates itself via abort(). */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0)
+ abort();
+
+ pidfd = sys_pidfd_open(pid, 0);
+ ASSERT_GE(pidfd, 0);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGABRT);
+ ASSERT_TRUE(WCOREDUMP(status));
+
+ /* The same signal must be visible through the parent's own pidfd after reaping. */
+ ASSERT_TRUE(get_pidfd_info(pidfd, &info));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_COREDUMP_SIGNAL));
+ ASSERT_EQ(info.coredump_signal, SIGABRT);
+
+ /* The server's checks passed only if it exited with status 0. */
+ wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
+}
+
+/*
+ * Test: malformed socket core patterns are refused
+ *
+ * Every pattern below is expected to make set_core_pattern() return false,
+ * i.e. the write to core_pattern must not succeed in full.
+ */
+TEST_F(coredump, socket_invalid_paths)
+{
+ /* '@' prefix: space before the path, and ".." in various positions. */
+ ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@.."));
+
+ /* The same malformed paths with the '@@' prefix. */
+ ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
+ ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
+ ASSERT_FALSE(set_core_pattern("@@.."));
+
+ /* A third '@' is rejected even with an otherwise plain path. */
+ ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/coredump/coredump_test.h b/tools/testing/selftests/coredump/coredump_test.h
new file mode 100644
index 000000000000..ed47f01fa53c
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __COREDUMP_TEST_H
+#define __COREDUMP_TEST_H
+
+#include <stdbool.h>
+#include <sys/types.h>
+#include <linux/coredump.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+/* Coredump fixture */
+FIXTURE(coredump)
+{
+ /* NOTE(review): presumably the core_pattern saved for restoring later - confirm in setup/teardown. */
+ char original_core_pattern[256];
+ /* Pid of the forked coredump server; set to -ESRCH once it has been waited for. */
+ pid_t pid_coredump_server;
+ /* Directory fd that open_coredump_tmpfile() creates its O_TMPFILE files in. */
+ int fd_tmpfs_detached;
+};
+
+/* Shared helper function declarations */
+/* Thread body that loops on pause() and never returns. */
+void *do_nothing(void *arg);
+/* Spawns NUM_THREAD_SPAWN do_nothing() threads, then reads through a NULL pointer. */
+void crashing_child(void);
+/* Creates a tmpfs via fsopen/fsconfig/fsmount; returns the fsmount fd or -1. */
+int create_detached_tmpfs(void);
+/* Binds and listens on an AF_UNIX stream socket at @path; returns the fd or -1. */
+int create_and_listen_unix_socket(const char *path);
+/* Writes @pattern to /proc/sys/kernel/core_pattern; true only if all of it was written. */
+bool set_core_pattern(const char *pattern);
+/* Returns the SO_PEERPIDFD pidfd of the peer of socket @fd, or -1. */
+int get_peer_pidfd(int fd);
+/* Fills @info via PIDFD_GET_INFO (exit, coredump and coredump-signal bits requested). */
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info);
+
+/* Inline helper that uses harness types */
+/*
+ * Reap the coredump server and require that it exited with status 0.
+ *
+ * @pid_coredump_server: pid of the forked server process
+ * @_metadata: harness metadata, needed by the ASSERT_*() macros
+ * @self: fixture data; pid_coredump_server is set to -ESRCH after reaping
+ */
+static inline void wait_and_check_coredump_server(pid_t pid_coredump_server,
+						  struct __test_metadata *const _metadata,
+						  FIXTURE_DATA(coredump) *self)
+{
+	int status;
+	pid_t reaped;
+
+	/*
+	 * Check the wait itself: if waitpid() fails, 'status' is never
+	 * written and the WIF*() checks below would read garbage.
+	 */
+	reaped = waitpid(pid_coredump_server, &status, 0);
+	ASSERT_EQ(reaped, pid_coredump_server);
+
+	/* Only forget the pid once the child has really been reaped. */
+	self->pid_coredump_server = -ESRCH;
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+/* Protocol helper function declarations */
+/* Receives one enum coredump_mark from @fd; returns its value, or -1 on short read or unknown marker. */
+ssize_t recv_marker(int fd);
+/* True when the next marker received from @fd equals @mark. */
+bool read_marker(int fd, enum coredump_mark mark);
+/* Reads a coredump request from @fd into @req; false on any failure. */
+bool read_coredump_req(int fd, struct coredump_req *req);
+/* Sends an ack carrying @mask; @size_ack == 0 selects min(sizeof(struct coredump_ack), req->size_ack). */
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+ __u64 mask, size_t size_ack);
+/* True when req->size >= @min_size and req->mask consists of exactly the @required_mask bits. */
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+ __u64 required_mask);
+/* Opens an O_TMPFILE file inside the directory referred to by @fd_tmpfs_detached. */
+int open_coredump_tmpfile(int fd_tmpfs_detached);
+/* Copies @fd_coredump into @fd_core_file, closes all three fds and ends the process with _exit(). */
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file);
+
+#endif /* __COREDUMP_TEST_H */
diff --git a/tools/testing/selftests/coredump/coredump_test_helpers.c b/tools/testing/selftests/coredump/coredump_test_helpers.c
new file mode 100644
index 000000000000..a6f6d5f2ae07
--- /dev/null
+++ b/tools/testing/selftests/coredump/coredump_test_helpers.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/coredump.h>
+#include <linux/fs.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../filesystems/wrappers.h"
+#include "../pidfd/pidfd.h"
+
+/* Forward declarations to avoid including harness header */
+struct __test_metadata;
+
+/* Match the fixture definition from coredump_test.h */
+/*
+ * Hand-written mirror of the fixture layout; it has to be kept in sync
+ * manually because the harness header is not included in this file.
+ * NOTE(review): no function in this file references this struct - confirm
+ * it is still needed.
+ */
+struct _fixture_coredump_data {
+ char original_core_pattern[256];
+ pid_t pid_coredump_server;
+ int fd_tmpfs_detached;
+};
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define NUM_THREAD_SPAWN 128
+
+/*
+ * Thread start routine that parks the calling thread: it sleeps in pause()
+ * and goes straight back to sleep whenever pause() returns.
+ */
+void *do_nothing(void *arg)
+{
+	(void)arg;
+
+	for (;;)
+		pause();
+
+	/* Not reached; present to satisfy the pthread start-routine signature. */
+	return NULL;
+}
+
+/*
+ * Turn the calling process into a multi-threaded one and then fault.
+ *
+ * Starts NUM_THREAD_SPAWN threads running do_nothing() and afterwards reads
+ * through a NULL pointer. Thread creation is best effort: a failed
+ * pthread_create() is ignored, the crash happens regardless.
+ */
+void crashing_child(void)
+{
+	pthread_t thread;
+	int i;
+
+	for (i = 0; i < NUM_THREAD_SPAWN; ++i)
+		pthread_create(&thread, NULL, do_nothing, NULL);
+
+	/*
+	 * crash on purpose
+	 *
+	 * The read is volatile so the compiler cannot drop it: the loaded
+	 * value is never used, and a plain NULL dereference is undefined
+	 * behavior that an optimizing build may remove entirely.
+	 */
+	i = *(volatile int *)NULL;
+	(void)i;
+}
+
+/*
+ * Create a tmpfs instance that is not attached anywhere in the mount tree.
+ *
+ * Return: the fd returned by sys_fsmount() (negative if that call failed),
+ * or -1 if opening or configuring the filesystem context failed.
+ */
+int create_detached_tmpfs(void)
+{
+	int fd_context, fd_tmpfs;
+
+	fd_context = sys_fsopen("tmpfs", 0);
+	if (fd_context < 0)
+		return -1;
+
+	if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+		/* Do not leak the context fd; keep errno from the failed call. */
+		int saved_errno = errno;
+
+		close(fd_context);
+		errno = saved_errno;
+		return -1;
+	}
+
+	/* The context is only needed to produce the mount fd. */
+	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+	close(fd_context);
+	return fd_tmpfs;
+}
+
+/*
+ * Create an AF_UNIX stream socket (close-on-exec), bind it to @path and start
+ * listening with a backlog of 128.
+ *
+ * @path must be shorter than sun_path minus the terminator; this is enforced
+ * with assert().
+ *
+ * Return: the listening socket fd, or -1 on any failure (the socket is closed
+ * again in that case).
+ */
+int create_and_listen_unix_socket(const char *path)
+{
+	struct sockaddr_un addr = {
+		.sun_family = AF_UNIX,
+	};
+	size_t path_len = strlen(path);
+	size_t addr_len;
+	int fd;
+
+	assert(path_len < sizeof(addr.sun_path) - 1);
+	strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
+	/* Address length covers the path and its terminating NUL. */
+	addr_len = offsetof(struct sockaddr_un, sun_path) + path_len + 1;
+
+	fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+	if (fd < 0)
+		return -1;
+
+	if (bind(fd, (const struct sockaddr *)&addr, addr_len) < 0 ||
+	    listen(fd, 128) < 0) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Write @pattern to /proc/sys/kernel/core_pattern.
+ *
+ * Return: true only if the file could be opened and the complete pattern was
+ * written; false if open() or write() failed or the write was short.
+ */
+bool set_core_pattern(const char *pattern)
+{
+	const size_t len = strlen(pattern);
+	ssize_t ret;
+	int fd;
+
+	fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
+	if (fd < 0)
+		return false;
+
+	ret = write(fd, pattern, len);
+	close(fd);
+	if (ret < 0)
+		return false;
+
+	/* ret is an ssize_t, so it is printed with %zd rather than %zu. */
+	fprintf(stderr, "Set core_pattern to '%s' | %zd == %zu\n", pattern, ret, len);
+	/* ret is non-negative here, so the cast is value-preserving. */
+	return (size_t)ret == len;
+}
+
+/*
+ * Fetch a pidfd for the peer of the connected socket @fd using the
+ * SO_PEERPIDFD socket option.
+ *
+ * Return: the pidfd, or -1 if getsockopt() failed. The outcome is logged to
+ * stderr either way.
+ */
+int get_peer_pidfd(int fd)
+{
+	int pidfd;
+	socklen_t optlen = sizeof(pidfd);
+
+	if (getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &optlen) < 0) {
+		fprintf(stderr, "get_peer_pidfd: getsockopt(SO_PEERPIDFD) failed: %m\n");
+		return -1;
+	}
+
+	fprintf(stderr, "get_peer_pidfd: successfully retrieved pidfd %d\n", pidfd);
+	return pidfd;
+}
+
+/*
+ * Query PIDFD_GET_INFO on @fd_peer_pidfd.
+ *
+ * @info is cleared first and then asked for the exit, coredump and
+ * coredump-signal fields. The resulting masks and signal are logged.
+ *
+ * Return: true if the ioctl succeeded, false otherwise.
+ */
+bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
+
+	if (ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) < 0) {
+		fprintf(stderr, "get_pidfd_info: ioctl(PIDFD_GET_INFO) failed: %m\n");
+		return false;
+	}
+
+	fprintf(stderr, "get_pidfd_info: mask=0x%llx, coredump_mask=0x%x, coredump_signal=%d\n",
+		(unsigned long long)info->mask, info->coredump_mask, info->coredump_signal);
+	return true;
+}
+
+/* Protocol helper functions */
+
+/*
+ * Receive a single coredump marker from @fd and log which one it was.
+ *
+ * Return: the marker value for the five known markers, or -1 if the full
+ * marker could not be received or its value is unknown.
+ */
+ssize_t recv_marker(int fd)
+{
+	enum coredump_mark mark = COREDUMP_MARK_REQACK;
+
+	if (recv(fd, &mark, sizeof(mark), MSG_WAITALL) != sizeof(mark))
+		return -1;
+
+	switch (mark) {
+	case COREDUMP_MARK_REQACK:
+		fprintf(stderr, "Received marker: ReqAck\n");
+		break;
+	case COREDUMP_MARK_MINSIZE:
+		fprintf(stderr, "Received marker: MinSize\n");
+		break;
+	case COREDUMP_MARK_MAXSIZE:
+		fprintf(stderr, "Received marker: MaxSize\n");
+		break;
+	case COREDUMP_MARK_UNSUPPORTED:
+		fprintf(stderr, "Received marker: Unsupported\n");
+		break;
+	case COREDUMP_MARK_CONFLICTING:
+		fprintf(stderr, "Received marker: Conflicting\n");
+		break;
+	default:
+		fprintf(stderr, "Received unknown marker: %u\n", mark);
+		return -1;
+	}
+
+	/* Only known markers reach this point. */
+	return mark;
+}
+
+/*
+ * Receive one marker from @fd and compare it with the expected @mark.
+ *
+ * Return: true only if a marker was received and it equals @mark.
+ */
+bool read_marker(int fd, enum coredump_mark mark)
+{
+	ssize_t received = recv_marker(fd);
+
+	return received >= 0 && received == mark;
+}
+
+/*
+ * Read one coredump request from @fd into @req.
+ *
+ * The request's size field is peeked first (it is read into the start of
+ * @req, so it is expected to be the leading member). Then
+ * min(sizeof(*req), kernel size) bytes are consumed into @req, and any
+ * trailing bytes of a request larger than our struct are discarded.
+ *
+ * Return: true if the request was read completely; false on a short or
+ * failed receive or if the advertised size is out of range.
+ */
+bool read_coredump_req(int fd, struct coredump_req *req)
+{
+	ssize_t ret;
+	size_t field_size, user_size, ack_size, kernel_size, remaining_size;
+
+	memset(req, 0, sizeof(*req));
+	field_size = sizeof(req->size);
+
+	/* Peek the size of the coredump request. */
+	ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
+	if (ret < 0 || (size_t)ret != field_size) {
+		fprintf(stderr, "read_coredump_req: peek failed (got %zd, expected %zu): %m\n",
+			ret, field_size);
+		return false;
+	}
+	kernel_size = req->size;
+
+	if (kernel_size < COREDUMP_ACK_SIZE_VER0) {
+		fprintf(stderr, "read_coredump_req: kernel_size %zu < min %d\n",
+			kernel_size, COREDUMP_ACK_SIZE_VER0);
+		return false;
+	}
+	if (kernel_size >= PAGE_SIZE) {
+		fprintf(stderr, "read_coredump_req: kernel_size %zu >= PAGE_SIZE %d\n",
+			kernel_size, PAGE_SIZE);
+		return false;
+	}
+
+	/* Use the minimum of user and kernel size to read the full request. */
+	user_size = sizeof(struct coredump_req);
+	ack_size = user_size < kernel_size ? user_size : kernel_size;
+	ret = recv(fd, req, ack_size, MSG_WAITALL);
+	if (ret < 0 || (size_t)ret != ack_size)
+		return false;
+
+	fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
+		req->size, (unsigned long long)req->mask);
+
+	/*
+	 * Bytes are left on the socket only when the kernel's request is
+	 * larger than our struct. A smaller request has been consumed in
+	 * full, so there is nothing to discard.
+	 */
+	remaining_size = kernel_size > user_size ? kernel_size - user_size : 0;
+
+	if (PAGE_SIZE <= remaining_size)
+		return false;
+
+	/*
+	 * Discard any additional data if the kernel's request was larger than
+	 * what we knew about or cared about.
+	 */
+	if (remaining_size) {
+		char buffer[PAGE_SIZE];
+
+		/*
+		 * Ask for exactly the leftover bytes: with MSG_WAITALL a larger
+		 * request would keep waiting for data that is not part of the
+		 * coredump request.
+		 */
+		ret = recv(fd, buffer, remaining_size, MSG_WAITALL);
+		if (ret < 0 || (size_t)ret != remaining_size)
+			return false;
+		fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
+	}
+
+	return true;
+}
+
+/*
+ * Send a coredump ack with @mask to @fd.
+ *
+ * @size_ack is the number of bytes to send and the value stored in the ack's
+ * size field. Passing 0 selects the smaller of sizeof(struct coredump_ack)
+ * and req->size_ack.
+ *
+ * Return: true if exactly @size_ack bytes were sent.
+ */
+bool send_coredump_ack(int fd, const struct coredump_req *req,
+		       __u64 mask, size_t size_ack)
+{
+	/*
+	 * The ack is embedded in a larger, zeroed struct so that a caller
+	 * can simulate sending too much data to the kernel.
+	 */
+	struct {
+		struct coredump_ack ack;
+		char buffer[PAGE_SIZE];
+	} oversized = {};
+	ssize_t sent;
+
+	if (size_ack == 0) {
+		size_ack = req->size_ack;
+		if (sizeof(struct coredump_ack) < size_ack)
+			size_ack = sizeof(struct coredump_ack);
+	}
+
+	oversized.ack.mask = mask;
+	oversized.ack.size = size_ack;
+
+	sent = send(fd, &oversized, size_ack, MSG_NOSIGNAL);
+	if (sent != size_ack)
+		return false;
+
+	fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
+		size_ack, (unsigned long long)mask);
+	return true;
+}
+
+/*
+ * Validate a received coredump request.
+ *
+ * Return: true when req->size is at least @min_size and req->mask contains
+ * every bit of @required_mask and no bit outside of it.
+ */
+bool check_coredump_req(const struct coredump_req *req, size_t min_size,
+			__u64 required_mask)
+{
+	const bool big_enough = req->size >= min_size;
+	const bool has_required = (req->mask & required_mask) == required_mask;
+	const bool has_extra = (req->mask & ~required_mask) != 0;
+
+	return big_enough && has_required && !has_extra;
+}
+
+/*
+ * Open an unnamed read/write temporary file (mode 0600) in the directory
+ * referred to by @fd_tmpfs_detached.
+ *
+ * Return: the new fd, or the negative result of openat() on failure.
+ */
+int open_coredump_tmpfile(int fd_tmpfs_detached)
+{
+	const int flags = O_TMPFILE | O_RDWR | O_EXCL;
+
+	return openat(fd_tmpfs_detached, ".", flags, 0600);
+}
+
+/*
+ * Drain a coredump connection into a file, then terminate the process.
+ *
+ * @fd_coredump: connected coredump socket; switched to non-blocking mode
+ * @fd_peer_pidfd: pidfd of the peer; only closed here
+ * @fd_core_file: file the received bytes are written to
+ *
+ * The socket is watched with edge-triggered epoll and read until EOF. All
+ * three fds are closed on exit. This function does not return: it calls
+ * _exit() with EXIT_SUCCESS once EOF was reached and EXIT_FAILURE on any
+ * error.
+ */
+void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
+{
+	int epfd = -1;
+	int exit_code = EXIT_FAILURE;
+	struct epoll_event ev;
+	int flags;
+
+	/* Set socket to non-blocking mode for edge-triggered epoll */
+	flags = fcntl(fd_coredump, F_GETFL, 0);
+	if (flags < 0) {
+		fprintf(stderr, "Worker: fcntl(F_GETFL) failed: %m\n");
+		goto out;
+	}
+	if (fcntl(fd_coredump, F_SETFL, flags | O_NONBLOCK) < 0) {
+		fprintf(stderr, "Worker: fcntl(F_SETFL, O_NONBLOCK) failed: %m\n");
+		goto out;
+	}
+
+	epfd = epoll_create1(0);
+	if (epfd < 0) {
+		fprintf(stderr, "Worker: epoll_create1() failed: %m\n");
+		goto out;
+	}
+
+	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
+	ev.data.fd = fd_coredump;
+	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0) {
+		fprintf(stderr, "Worker: epoll_ctl(EPOLL_CTL_ADD) failed: %m\n");
+		goto out;
+	}
+
+	for (;;) {
+		struct epoll_event events[1];
+		int n = epoll_wait(epfd, events, 1, -1);
+
+		if (n < 0) {
+			/* An interrupted wait is not an error, just retry. */
+			if (errno == EINTR)
+				continue;
+			/*
+			 * A real failure must take the error exit; leaving
+			 * the loop would run into the success path below.
+			 */
+			fprintf(stderr, "Worker: epoll_wait() failed: %m\n");
+			goto out;
+		}
+		/* No event reported: events[0] holds nothing to look at. */
+		if (n == 0)
+			continue;
+
+		if (!(events[0].events & (EPOLLIN | EPOLLRDHUP)))
+			continue;
+
+		/* Edge-triggered: keep reading until the socket runs dry. */
+		for (;;) {
+			char buffer[4096];
+			ssize_t bytes_read, bytes_write;
+
+			bytes_read = read(fd_coredump, buffer, sizeof(buffer));
+			if (bytes_read < 0) {
+				if (errno == EAGAIN || errno == EWOULDBLOCK)
+					break;
+				fprintf(stderr, "Worker: read() failed: %m\n");
+				goto out;
+			}
+			if (bytes_read == 0)
+				goto done;
+
+			bytes_write = write(fd_core_file, buffer, bytes_read);
+			if (bytes_write != bytes_read) {
+				/*
+				 * Out of space is tolerated: keep draining the
+				 * socket and drop what cannot be stored.
+				 */
+				if (bytes_write < 0 && errno == ENOSPC)
+					continue;
+				fprintf(stderr, "Worker: write() failed (read=%zd, write=%zd): %m\n",
+					bytes_read, bytes_write);
+				goto out;
+			}
+		}
+	}
+
+done:
+	exit_code = EXIT_SUCCESS;
+	fprintf(stderr, "Worker: completed successfully\n");
+out:
+	if (epfd >= 0)
+		close(epfd);
+	if (fd_core_file >= 0)
+		close(fd_core_file);
+	if (fd_peer_pidfd >= 0)
+		close(fd_peer_pidfd);
+	if (fd_coredump >= 0)
+		close(fd_coredump);
+	_exit(exit_code);
+}
diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c
index a4ac80bb1003..c2e895bcc160 100644
--- a/tools/testing/selftests/coredump/stackdump_test.c
+++ b/tools/testing/selftests/coredump/stackdump_test.c
@@ -23,57 +23,15 @@
#include "../filesystems/wrappers.h"
#include "../pidfd/pidfd.h"
+#include "coredump_test.h"
+
#define STACKDUMP_FILE "stack_values"
#define STACKDUMP_SCRIPT "stackdump"
-#define NUM_THREAD_SPAWN 128
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
-static void *do_nothing(void *)
-{
- while (1)
- pause();
-
- return NULL;
-}
-
-static void crashing_child(void)
-{
- pthread_t thread;
- int i;
-
- for (i = 0; i < NUM_THREAD_SPAWN; ++i)
- pthread_create(&thread, NULL, do_nothing, NULL);
-
- /* crash on purpose */
- i = *(int *)NULL;
-}
-
-FIXTURE(coredump)
-{
- char original_core_pattern[256];
- pid_t pid_coredump_server;
- int fd_tmpfs_detached;
-};
-
-static int create_detached_tmpfs(void)
-{
- int fd_context, fd_tmpfs;
-
- fd_context = sys_fsopen("tmpfs", 0);
- if (fd_context < 0)
- return -1;
-
- if (sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
- return -1;
-
- fd_tmpfs = sys_fsmount(fd_context, 0, 0);
- close(fd_context);
- return fd_tmpfs;
-}
-
FIXTURE_SETUP(coredump)
{
FILE *file;
@@ -208,1620 +166,4 @@ TEST_F_TIMEOUT(coredump, stackdump, 120)
fclose(file);
}
-static int create_and_listen_unix_socket(const char *path)
-{
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- assert(strlen(path) < sizeof(addr.sun_path) - 1);
- strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
- size_t addr_len =
- offsetof(struct sockaddr_un, sun_path) + strlen(path) + 1;
- int fd, ret;
-
- fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd < 0)
- goto out;
-
- ret = bind(fd, (const struct sockaddr *)&addr, addr_len);
- if (ret < 0)
- goto out;
-
- ret = listen(fd, 128);
- if (ret < 0)
- goto out;
-
- return fd;
-
-out:
- if (fd >= 0)
- close(fd);
- return -1;
-}
-
-static bool set_core_pattern(const char *pattern)
-{
- int fd;
- ssize_t ret;
-
- fd = open("/proc/sys/kernel/core_pattern", O_WRONLY | O_CLOEXEC);
- if (fd < 0)
- return false;
-
- ret = write(fd, pattern, strlen(pattern));
- close(fd);
- if (ret < 0)
- return false;
-
- fprintf(stderr, "Set core_pattern to '%s' | %zu == %zu\n", pattern, ret, strlen(pattern));
- return ret == strlen(pattern);
-}
-
-static int get_peer_pidfd(int fd)
-{
- int fd_peer_pidfd;
- socklen_t fd_peer_pidfd_len = sizeof(fd_peer_pidfd);
- int ret = getsockopt(fd, SOL_SOCKET, SO_PEERPIDFD, &fd_peer_pidfd,
- &fd_peer_pidfd_len);
- if (ret < 0) {
- fprintf(stderr, "%m - Failed to retrieve peer pidfd for coredump socket connection\n");
- return -1;
- }
- return fd_peer_pidfd;
-}
-
-static bool get_pidfd_info(int fd_peer_pidfd, struct pidfd_info *info)
-{
- memset(info, 0, sizeof(*info));
- info->mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- return ioctl(fd_peer_pidfd, PIDFD_GET_INFO, info) == 0;
-}
-
-static void
-wait_and_check_coredump_server(pid_t pid_coredump_server,
- struct __test_metadata *const _metadata,
- FIXTURE_DATA(coredump)* self)
-{
- int status;
- waitpid(pid_coredump_server, &status, 0);
- self->pid_coredump_server = -ESRCH;
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-}
-
-TEST_F(coredump, socket)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_detect_userspace_client)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (info.coredump_mask & PIDFD_COREDUMPED)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0) {
- int fd_socket;
- ssize_t ret;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len =
- offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- fd_socket = socket(AF_UNIX, SOCK_STREAM, 0);
- if (fd_socket < 0)
- _exit(EXIT_FAILURE);
-
- ret = connect(fd_socket, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- _exit(EXIT_FAILURE);
-
- close(fd_socket);
- _exit(EXIT_SUCCESS);
- }
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFEXITED(status));
- ASSERT_EQ(WEXITSTATUS(status), 0);
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_EQ((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_NE(stat("/tmp/coredump.file", &st), 0);
- ASSERT_EQ(errno, ENOENT);
-}
-
-TEST_F(coredump, socket_enoent)
-{
- int pidfd, status;
- pid_t pid;
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-}
-
-TEST_F(coredump, socket_no_listener)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- int ipc_sockets[2];
- char c;
- const struct sockaddr_un coredump_sk = {
- .sun_family = AF_UNIX,
- .sun_path = "/tmp/coredump.socket",
- };
- size_t coredump_sk_len = offsetof(struct sockaddr_un, sun_path) +
- sizeof("/tmp/coredump.socket");
-
- ASSERT_TRUE(set_core_pattern("@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- if (fd_server < 0)
- goto out;
-
- ret = bind(fd_server, (const struct sockaddr *)&coredump_sk, coredump_sk_len);
- if (ret < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
- close(ipc_sockets[1]);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static ssize_t recv_marker(int fd)
-{
- enum coredump_mark mark = COREDUMP_MARK_REQACK;
- ssize_t ret;
-
- ret = recv(fd, &mark, sizeof(mark), MSG_WAITALL);
- if (ret != sizeof(mark))
- return -1;
-
- switch (mark) {
- case COREDUMP_MARK_REQACK:
- fprintf(stderr, "Received marker: ReqAck\n");
- return COREDUMP_MARK_REQACK;
- case COREDUMP_MARK_MINSIZE:
- fprintf(stderr, "Received marker: MinSize\n");
- return COREDUMP_MARK_MINSIZE;
- case COREDUMP_MARK_MAXSIZE:
- fprintf(stderr, "Received marker: MaxSize\n");
- return COREDUMP_MARK_MAXSIZE;
- case COREDUMP_MARK_UNSUPPORTED:
- fprintf(stderr, "Received marker: Unsupported\n");
- return COREDUMP_MARK_UNSUPPORTED;
- case COREDUMP_MARK_CONFLICTING:
- fprintf(stderr, "Received marker: Conflicting\n");
- return COREDUMP_MARK_CONFLICTING;
- default:
- fprintf(stderr, "Received unknown marker: %u\n", mark);
- break;
- }
- return -1;
-}
-
-static bool read_marker(int fd, enum coredump_mark mark)
-{
- ssize_t ret;
-
- ret = recv_marker(fd);
- if (ret < 0)
- return false;
- return ret == mark;
-}
-
-static bool read_coredump_req(int fd, struct coredump_req *req)
-{
- ssize_t ret;
- size_t field_size, user_size, ack_size, kernel_size, remaining_size;
-
- memset(req, 0, sizeof(*req));
- field_size = sizeof(req->size);
-
- /* Peek the size of the coredump request. */
- ret = recv(fd, req, field_size, MSG_PEEK | MSG_WAITALL);
- if (ret != field_size)
- return false;
- kernel_size = req->size;
-
- if (kernel_size < COREDUMP_ACK_SIZE_VER0)
- return false;
- if (kernel_size >= PAGE_SIZE)
- return false;
-
- /* Use the minimum of user and kernel size to read the full request. */
- user_size = sizeof(struct coredump_req);
- ack_size = user_size < kernel_size ? user_size : kernel_size;
- ret = recv(fd, req, ack_size, MSG_WAITALL);
- if (ret != ack_size)
- return false;
-
- fprintf(stderr, "Read coredump request with size %u and mask 0x%llx\n",
- req->size, (unsigned long long)req->mask);
-
- if (user_size > kernel_size)
- remaining_size = user_size - kernel_size;
- else
- remaining_size = kernel_size - user_size;
-
- if (PAGE_SIZE <= remaining_size)
- return false;
-
- /*
- * Discard any additional data if the kernel's request was larger than
- * what we knew about or cared about.
- */
- if (remaining_size) {
- char buffer[PAGE_SIZE];
-
- ret = recv(fd, buffer, sizeof(buffer), MSG_WAITALL);
- if (ret != remaining_size)
- return false;
- fprintf(stderr, "Discarded %zu bytes of data after coredump request\n", remaining_size);
- }
-
- return true;
-}
-
-static bool send_coredump_ack(int fd, const struct coredump_req *req,
- __u64 mask, size_t size_ack)
-{
- ssize_t ret;
- /*
- * Wrap struct coredump_ack in a larger struct so we can
- * simulate sending to much data to the kernel.
- */
- struct large_ack_for_size_testing {
- struct coredump_ack ack;
- char buffer[PAGE_SIZE];
- } large_ack = {};
-
- if (!size_ack)
- size_ack = sizeof(struct coredump_ack) < req->size_ack ?
- sizeof(struct coredump_ack) :
- req->size_ack;
- large_ack.ack.mask = mask;
- large_ack.ack.size = size_ack;
- ret = send(fd, &large_ack, size_ack, MSG_NOSIGNAL);
- if (ret != size_ack)
- return false;
-
- fprintf(stderr, "Sent coredump ack with size %zu and mask 0x%llx\n",
- size_ack, (unsigned long long)mask);
- return true;
-}
-
-static bool check_coredump_req(const struct coredump_req *req, size_t min_size,
- __u64 required_mask)
-{
- if (req->size < min_size)
- return false;
- if ((req->mask & required_mask) != required_mask)
- return false;
- if (req->mask & ~required_mask)
- return false;
- return true;
-}
-
-TEST_F(coredump, socket_request_kernel)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct stat st;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_core_file = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- fd_core_file = creat("/tmp/coredump.file", 0644);
- if (fd_core_file < 0)
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write)
- goto out;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-
- ASSERT_EQ(stat("/tmp/coredump.file", &st), 0);
- ASSERT_GT(st.st_size, 0);
- system("file /tmp/coredump.file");
-}
-
-TEST_F(coredump, socket_request_userspace)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_USERSPACE | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_TRUE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_reject)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read > 0)
- goto out;
-
- if (bytes_read < 0)
- goto out;
-
- if (bytes_read == 0)
- break;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_flag_combination)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_REJECT | COREDUMP_WAIT, 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_CONFLICTING))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_unknown_flag)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req, (1ULL << 63), 0))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_UNSUPPORTED))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_small)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 / 2))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MINSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_request_invalid_size_large)
-{
- int pidfd, ret, status;
- pid_t pid, pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
- ASSERT_EQ(ret, 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- struct coredump_req req = {};
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1;
- int exit_code = EXIT_FAILURE;
-
- close(ipc_sockets[0]);
-
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
-
- close(ipc_sockets[1]);
-
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0)
- goto out;
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
-
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
-
- if (!(info.mask & PIDFD_INFO_COREDUMP))
- goto out;
-
- if (!(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
-
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_REJECT | COREDUMP_WAIT,
- COREDUMP_ACK_SIZE_VER0 + PAGE_SIZE))
- goto out;
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_MAXSIZE))
- goto out;
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- pid = fork();
- ASSERT_GE(pid, 0);
- if (pid == 0)
- crashing_child();
-
- pidfd = sys_pidfd_open(pid, 0);
- ASSERT_GE(pidfd, 0);
-
- waitpid(pid, &status, 0);
- ASSERT_TRUE(WIFSIGNALED(status));
- ASSERT_FALSE(WCOREDUMP(status));
-
- ASSERT_TRUE(get_pidfd_info(pidfd, &info));
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-static int open_coredump_tmpfile(int fd_tmpfs_detached)
-{
- return openat(fd_tmpfs_detached, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
-}
-
-#define NUM_CRASHING_COREDUMPS 5
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server;
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
-
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- int exit_code = EXIT_FAILURE;
- struct coredump_req req = {};
-
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0) {
- fprintf(stderr, "Failed to create and listen on unix socket\n");
- goto out;
- }
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0) {
- fprintf(stderr, "Failed to notify parent via ipc socket\n");
- goto out;
- }
- close(ipc_sockets[1]);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- fprintf(stderr, "accept4 failed: %m\n");
- goto out;
- }
-
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0) {
- fprintf(stderr, "get_peer_pidfd failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (!get_pidfd_info(fd_peer_pidfd, &info)) {
- fprintf(stderr, "get_pidfd_info failed for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!(info.mask & PIDFD_INFO_COREDUMP)) {
- fprintf(stderr, "pidfd info missing PIDFD_INFO_COREDUMP for fd %d\n", fd_peer_pidfd);
- goto out;
- }
- if (!(info.coredump_mask & PIDFD_COREDUMPED)) {
- fprintf(stderr, "pidfd info missing PIDFD_COREDUMPED for fd %d\n", fd_peer_pidfd);
- goto out;
- }
-
- if (!read_coredump_req(fd_coredump, &req)) {
- fprintf(stderr, "read_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT)) {
- fprintf(stderr, "check_coredump_req failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!send_coredump_ack(fd_coredump, &req,
- COREDUMP_KERNEL | COREDUMP_WAIT, 0)) {
- fprintf(stderr, "send_coredump_ack failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK)) {
- fprintf(stderr, "read_marker failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0) {
- fprintf(stderr, "%m - open_coredump_tmpfile failed for fd %d\n", fd_coredump);
- goto out;
- }
-
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read, bytes_write;
-
- bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- fprintf(stderr, "read failed for fd %d: %m\n", fd_coredump);
- goto out;
- }
-
- if (bytes_read == 0)
- break;
-
- bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_read != bytes_write) {
- fprintf(stderr, "write failed for fd %d: %m\n", fd_core_file);
- goto out;
- }
- }
-
- close(fd_core_file);
- close(fd_peer_pidfd);
- close(fd_coredump);
- fd_peer_pidfd = -1;
- fd_coredump = -1;
- }
-
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_server >= 0)
- close(fd_server);
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- waitpid(pid[i], &status[i], 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-#define MAX_EVENTS 128
-
-static void process_coredump_worker(int fd_coredump, int fd_peer_pidfd, int fd_core_file)
-{
- int epfd = -1;
- int exit_code = EXIT_FAILURE;
-
- epfd = epoll_create1(0);
- if (epfd < 0)
- goto out;
-
- struct epoll_event ev;
- ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
- ev.data.fd = fd_coredump;
- if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd_coredump, &ev) < 0)
- goto out;
-
- for (;;) {
- struct epoll_event events[1];
- int n = epoll_wait(epfd, events, 1, -1);
- if (n < 0)
- break;
-
- if (events[0].events & (EPOLLIN | EPOLLRDHUP)) {
- for (;;) {
- char buffer[4096];
- ssize_t bytes_read = read(fd_coredump, buffer, sizeof(buffer));
- if (bytes_read < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- break;
- goto out;
- }
- if (bytes_read == 0)
- goto done;
- ssize_t bytes_write = write(fd_core_file, buffer, bytes_read);
- if (bytes_write != bytes_read)
- goto out;
- }
- }
- }
-
-done:
- exit_code = EXIT_SUCCESS;
-out:
- if (epfd >= 0)
- close(epfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_coredump >= 0)
- close(fd_coredump);
- _exit(exit_code);
-}
-
-TEST_F_TIMEOUT(coredump, socket_multiple_crashing_coredumps_epoll_workers, 500)
-{
- int pidfd[NUM_CRASHING_COREDUMPS], status[NUM_CRASHING_COREDUMPS];
- pid_t pid[NUM_CRASHING_COREDUMPS], pid_coredump_server, worker_pids[NUM_CRASHING_COREDUMPS];
- struct pidfd_info info = {};
- int ipc_sockets[2];
- char c;
-
- ASSERT_TRUE(set_core_pattern("@@/tmp/coredump.socket"));
- ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
-
- pid_coredump_server = fork();
- ASSERT_GE(pid_coredump_server, 0);
- if (pid_coredump_server == 0) {
- int fd_server = -1, exit_code = EXIT_FAILURE, n_conns = 0;
- fd_server = -1;
- exit_code = EXIT_FAILURE;
- n_conns = 0;
- close(ipc_sockets[0]);
- fd_server = create_and_listen_unix_socket("/tmp/coredump.socket");
- if (fd_server < 0)
- goto out;
-
- if (write_nointr(ipc_sockets[1], "1", 1) < 0)
- goto out;
- close(ipc_sockets[1]);
-
- while (n_conns < NUM_CRASHING_COREDUMPS) {
- int fd_coredump = -1, fd_peer_pidfd = -1, fd_core_file = -1;
- struct coredump_req req = {};
- fd_coredump = accept4(fd_server, NULL, NULL, SOCK_CLOEXEC);
- if (fd_coredump < 0) {
- if (errno == EAGAIN || errno == EWOULDBLOCK)
- continue;
- goto out;
- }
- fd_peer_pidfd = get_peer_pidfd(fd_coredump);
- if (fd_peer_pidfd < 0)
- goto out;
- if (!get_pidfd_info(fd_peer_pidfd, &info))
- goto out;
- if (!(info.mask & PIDFD_INFO_COREDUMP) || !(info.coredump_mask & PIDFD_COREDUMPED))
- goto out;
- if (!read_coredump_req(fd_coredump, &req))
- goto out;
- if (!check_coredump_req(&req, COREDUMP_ACK_SIZE_VER0,
- COREDUMP_KERNEL | COREDUMP_USERSPACE |
- COREDUMP_REJECT | COREDUMP_WAIT))
- goto out;
- if (!send_coredump_ack(fd_coredump, &req, COREDUMP_KERNEL | COREDUMP_WAIT, 0))
- goto out;
- if (!read_marker(fd_coredump, COREDUMP_MARK_REQACK))
- goto out;
- fd_core_file = open_coredump_tmpfile(self->fd_tmpfs_detached);
- if (fd_core_file < 0)
- goto out;
- pid_t worker = fork();
- if (worker == 0) {
- close(fd_server);
- process_coredump_worker(fd_coredump, fd_peer_pidfd, fd_core_file);
- }
- worker_pids[n_conns] = worker;
- if (fd_coredump >= 0)
- close(fd_coredump);
- if (fd_peer_pidfd >= 0)
- close(fd_peer_pidfd);
- if (fd_core_file >= 0)
- close(fd_core_file);
- n_conns++;
- }
- exit_code = EXIT_SUCCESS;
-out:
- if (fd_server >= 0)
- close(fd_server);
-
- // Reap all worker processes
- for (int i = 0; i < n_conns; i++) {
- int wstatus;
- if (waitpid(worker_pids[i], &wstatus, 0) < 0) {
- fprintf(stderr, "Failed to wait for worker %d: %m\n", worker_pids[i]);
- } else if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != EXIT_SUCCESS) {
- fprintf(stderr, "Worker %d exited with error code %d\n", worker_pids[i], WEXITSTATUS(wstatus));
- exit_code = EXIT_FAILURE;
- }
- }
-
- _exit(exit_code);
- }
- self->pid_coredump_server = pid_coredump_server;
-
- EXPECT_EQ(close(ipc_sockets[1]), 0);
- ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
- EXPECT_EQ(close(ipc_sockets[0]), 0);
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- pid[i] = fork();
- ASSERT_GE(pid[i], 0);
- if (pid[i] == 0)
- crashing_child();
- pidfd[i] = sys_pidfd_open(pid[i], 0);
- ASSERT_GE(pidfd[i], 0);
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- ASSERT_GE(waitpid(pid[i], &status[i], 0), 0);
- ASSERT_TRUE(WIFSIGNALED(status[i]));
- ASSERT_TRUE(WCOREDUMP(status[i]));
- }
-
- for (int i = 0; i < NUM_CRASHING_COREDUMPS; i++) {
- info.mask = PIDFD_INFO_EXIT | PIDFD_INFO_COREDUMP;
- ASSERT_EQ(ioctl(pidfd[i], PIDFD_GET_INFO, &info), 0);
- ASSERT_GT((info.mask & PIDFD_INFO_COREDUMP), 0);
- ASSERT_GT((info.coredump_mask & PIDFD_COREDUMPED), 0);
- }
-
- wait_and_check_coredump_server(pid_coredump_server, _metadata, self);
-}
-
-TEST_F(coredump, socket_invalid_paths)
-{
- ASSERT_FALSE(set_core_pattern("@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@.."));
-
- ASSERT_FALSE(set_core_pattern("@@ /tmp/coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@../coredump.socket"));
- ASSERT_FALSE(set_core_pattern("@@/tmp/coredump.socket/.."));
- ASSERT_FALSE(set_core_pattern("@@.."));
-
- ASSERT_FALSE(set_core_pattern("@@@/tmp/coredump.socket"));
-}
-
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile
index 6e41635bd55a..71ee69e524d7 100644
--- a/tools/testing/selftests/drivers/net/Makefile
+++ b/tools/testing/selftests/drivers/net/Makefile
@@ -18,6 +18,7 @@ TEST_PROGS := \
netcons_fragmented_msg.sh \
netcons_overflow.sh \
netcons_sysdata.sh \
+ netcons_torture.sh \
netpoll_basic.py \
ping.py \
psp.py \
diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile
index 402d4ee84f2e..6c5c60adb5e8 100644
--- a/tools/testing/selftests/drivers/net/bonding/Makefile
+++ b/tools/testing/selftests/drivers/net/bonding/Makefile
@@ -14,6 +14,7 @@ TEST_PROGS := \
dev_addr_lists.sh \
mode-1-recovery-updelay.sh \
mode-2-recovery-updelay.sh \
+ netcons_over_bonding.sh \
# end of TEST_PROGS
TEST_FILES := \
@@ -24,6 +25,7 @@ TEST_FILES := \
TEST_INCLUDES := \
../../../net/lib.sh \
+ ../lib/sh/lib_netcons.sh \
../../../net/forwarding/lib.sh \
# end of TEST_INCLUDES
diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config
index 6bb290abd48b..991494376223 100644
--- a/tools/testing/selftests/drivers/net/bonding/config
+++ b/tools/testing/selftests/drivers/net/bonding/config
@@ -1,5 +1,6 @@
CONFIG_BONDING=y
CONFIG_BRIDGE=y
+CONFIG_CONFIGFS_FS=y
CONFIG_DUMMY=y
CONFIG_INET_ESP=y
CONFIG_INET_ESP_OFFLOAD=y
@@ -9,6 +10,9 @@ CONFIG_MACVLAN=y
CONFIG_NET_ACT_GACT=y
CONFIG_NET_CLS_FLOWER=y
CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NETCONSOLE=m
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_NETCONSOLE_EXTENDED_LOG=y
CONFIG_NETDEVSIM=m
CONFIG_NET_SCH_INGRESS=y
CONFIG_NLMON=y
diff --git a/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh
new file mode 100755
index 000000000000..477cc9379500
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/bonding/netcons_over_bonding.sh
@@ -0,0 +1,361 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This selftest exercises trying to have multiple netpoll users at the same
+# time.
+#
+# This selftest has multiple small tests inside, and the goal is to
+# get interfaces with bonding and netconsole in different orders in order
+# to catch any possible issue.
+#
+# The main test consists of four interfaces being created using netdevsim; two
+# of them are bonded to serve as the netconsole's transmit interface. The
+# remaining two interfaces are similarly bonded and assigned to a separate
+# network namespace, which acts as the receive interface, where socat monitors
+# for incoming messages.
+#
+# A netconsole message is then sent to ensure it is properly received across
+# this configuration.
+#
+# Later, run a few other tests, to make sure that bonding and netconsole
+# cannot coexist.
+#
+# The test's objective is to exercise netpoll usage when managed simultaneously
+# by multiple subsystems (netconsole and bonding).
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/../lib/sh/lib_netcons.sh
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+modprobe bonding 2> /dev/null || true
+modprobe veth 2> /dev/null || true
+
+# The content of kmsg will be saved to the following file
+OUTPUT_FILE="/tmp/${TARGET}"
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup_bond EXIT
+
+FORMAT="extended"
+IP_VERSION="ipv4"
+VETH0="veth"$(( RANDOM % 256))
+VETH1="veth"$((256 + RANDOM % 256))
+TXNS=""
+RXNS=""
+
+# Create "bond_tx_XX" and "bond_rx_XX" interfaces, and set DSTIF and SRCIF with
+# the bonding interfaces
+function setup_bonding_ifaces() {
+ local RAND=$(( RANDOM % 100 ))
+ BOND_TX_MAIN_IF="bond_tx_$RAND"
+ BOND_RX_MAIN_IF="bond_rx_$RAND"
+
+ # Setup TX
+ if ! ip -n "${TXNS}" link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr
+ then
+ echo "Failed to create bond TX interface. Is CONFIG_BONDING set?" >&2
+ # only clean nsim ifaces and namespace. Nothing else has been
+ # initialized
+ cleanup_bond_nsim
+ trap - EXIT
+ exit "${ksft_skip}"
+ fi
+
+ # create_netdevsim() got the interface up, but it needs to be down
+ # before being enslaved.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX_MAIN_IF}" up
+
+ # Setup RX
+ ip -n "${RXNS}" \
+ link add "${BOND_RX_MAIN_IF}" type bond mode balance-rr
+ ip -n "${RXNS}" \
+ link set "${BOND_RX1_SLAVE_IF}" down
+ ip -n "${RXNS}" \
+ link set "${BOND_RX2_SLAVE_IF}" down
+ ip -n "${RXNS}" \
+ link set "${BOND_RX1_SLAVE_IF}" master "${BOND_RX_MAIN_IF}"
+ ip -n "${RXNS}" \
+ link set "${BOND_RX2_SLAVE_IF}" master "${BOND_RX_MAIN_IF}"
+ ip -n "${RXNS}" \
+ link set "${BOND_RX_MAIN_IF}" up
+
+ export DSTIF="${BOND_RX_MAIN_IF}"
+ export SRCIF="${BOND_TX_MAIN_IF}"
+}
+
+# Create 4 netdevsim interfaces. Two of them will be enslaved to the TX bonding
+# iface and the other two to the RX bonding iface (in the other namespace)
+function create_ifaces_bond() {
+ BOND_TX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_1}" "${TXNS}")
+ BOND_TX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_TX_2}" "${TXNS}")
+ BOND_RX1_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_1}" "${RXNS}")
+ BOND_RX2_SLAVE_IF=$(create_netdevsim "${NSIM_BOND_RX_2}" "${RXNS}")
+}
+
+# netdevsim link BOND_TX to BOND_RX interfaces
+function link_ifaces_bond() {
+ local BOND_TX1_SLAVE_IFIDX
+ local BOND_TX2_SLAVE_IFIDX
+ local BOND_RX1_SLAVE_IFIDX
+ local BOND_RX2_SLAVE_IFIDX
+ local TXNS_FD
+ local RXNS_FD
+
+ BOND_TX1_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \
+ cat /sys/class/net/"$BOND_TX1_SLAVE_IF"/ifindex)
+ BOND_TX2_SLAVE_IFIDX=$(ip netns exec "${TXNS}" \
+ cat /sys/class/net/"$BOND_TX2_SLAVE_IF"/ifindex)
+ BOND_RX1_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \
+ cat /sys/class/net/"$BOND_RX1_SLAVE_IF"/ifindex)
+ BOND_RX2_SLAVE_IFIDX=$(ip netns exec "${RXNS}" \
+ cat /sys/class/net/"$BOND_RX2_SLAVE_IF"/ifindex)
+
+ exec {TXNS_FD}</var/run/netns/"${TXNS}"
+ exec {RXNS_FD}</var/run/netns/"${RXNS}"
+
+ # Linking TX ifaces to the RX ones (on the other namespace)
+ echo "${TXNS_FD}:$BOND_TX1_SLAVE_IFIDX $RXNS_FD:$BOND_RX1_SLAVE_IFIDX" \
+ > "$NSIM_DEV_SYS_LINK"
+ echo "${TXNS_FD}:$BOND_TX2_SLAVE_IFIDX $RXNS_FD:$BOND_RX2_SLAVE_IFIDX" \
+ > "$NSIM_DEV_SYS_LINK"
+
+ exec {TXNS_FD}<&-
+ exec {RXNS_FD}<&-
+}
+
+function create_all_ifaces() {
+ # setup_ns function is coming from lib.sh
+ setup_ns TXNS RXNS
+ export NAMESPACE="${RXNS}"
+
+ # Create two interfaces for RX and two for TX
+ create_ifaces_bond
+ # Link netdevsim ifaces
+ link_ifaces_bond
+}
+
+# configure DSTIF and SRCIF IPs
+function configure_ifaces_ips() {
+ local IP_VERSION=${1:-"ipv4"}
+ select_ipv4_or_ipv6 "${IP_VERSION}"
+
+ ip -n "${RXNS}" addr add "${DSTIP}"/24 dev "${DSTIF}"
+ ip -n "${RXNS}" link set "${DSTIF}" up
+
+ ip -n "${TXNS}" addr add "${SRCIP}"/24 dev "${SRCIF}"
+ ip -n "${TXNS}" link set "${SRCIF}" up
+}
+
+function test_enable_netpoll_on_enslaved_iface() {
+ echo 0 > "${NETCONS_PATH}"/enabled
+
+ # At this stage, BOND_TX1_SLAVE_IF is enslaved to BOND_TX_MAIN_IF, and
+ # linked to BOND_RX1_SLAVE_IF inside the namespace.
+ echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name
+
+ # This should fail with the following message in dmesg:
+ # netpoll: netconsole: ethX is a slave device, aborting
+ set +e
+ enable_netcons_ns 2> /dev/null
+ set -e
+
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]]
+ then
+ echo "test failed: Bonding and netpoll cannot co-exists." >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+function test_delete_bond_and_reenable_target() {
+ ip -n "${TXNS}" \
+ link delete "${BOND_TX_MAIN_IF}" type bond
+
+ # BOND_TX1_SLAVE_IF is not attached to a bond interface anymore
+ # netpoll can be plugged in there
+ echo "${BOND_TX1_SLAVE_IF}" > "${NETCONS_PATH}"/dev_name
+
+ # this should work, since the interface is not enslaved
+ enable_netcons_ns
+
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: Unable to start netpoll on an unbond iface." >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+# Send a netconsole message to the netconsole target
+function test_send_netcons_msg_through_bond_iface() {
+ # Listen for netconsole port inside the namespace and
+ # destination interface
+ listen_port_and_save_to "${OUTPUT_FILE}" "${IP_VERSION}" &
+ # Wait for socat to start and listen to the port.
+ wait_for_port "${RXNS}" "${PORT}" "${IP_VERSION}"
+ # Send the message
+ echo "${MSG}: ${TARGET}" > /dev/kmsg
+ # Wait until socat saves the file to disk
+ busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}"
+ # Make sure the message was received in the dst part
+ # and exit
+ validate_result "${OUTPUT_FILE}" "${FORMAT}"
+ # kill socat in case it is still running
+ pkill_socat
+}
+
+# BOND_TX1_SLAVE_IF has netconsole enabled on it, bind it to BOND_TX_MAIN_IF.
+# Given BOND_TX_MAIN_IF was deleted, recreate it first
+function test_enslave_netcons_enabled_iface {
+ # netconsole is expected to still be enabled on BOND_TX1_SLAVE_IF here
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: netconsole expected to be enabled against BOND_TX1_SLAVE_IF" >&2
+ exit "${ksft_fail}"
+ fi
+
+ # recreate the bonding iface. it got deleted by previous
+ # test (test_delete_bond_and_reenable_target)
+ ip -n "${TXNS}" \
+ link add "${BOND_TX_MAIN_IF}" type bond mode balance-rr
+
+ # sub-interface need to be down before attaching to bonding
+ # This will also disable netconsole.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" down
+ ip -n "${TXNS}" \
+ link set "${BOND_TX1_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ ip -n "${TXNS}" \
+ link set "${BOND_TX_MAIN_IF}" up
+
+ # netconsole got disabled while the interface was down
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 1 ]]
+ then
+ echo "test failed: Device is part of a bond iface, cannot have netcons enabled" >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+# Get netconsole enabled on a bonding interface and attach a second
+# sub-interface.
+function test_enslave_iface_to_bond {
+ # BOND_TX_MAIN_IF has only BOND_TX1_SLAVE_IF right now
+ echo "${BOND_TX_MAIN_IF}" > "${NETCONS_PATH}"/dev_name
+ enable_netcons_ns
+
+ # netcons is attached to BOND_TX_MAIN_IF and BOND_TX1_SLAVE_IF is
+ # part of BOND_TX_MAIN_IF. Attach BOND_TX2_SLAVE_IF to BOND_TX_MAIN_IF.
+ ip -n "${TXNS}" \
+ link set "${BOND_TX2_SLAVE_IF}" master "${BOND_TX_MAIN_IF}"
+ if [[ $(cat "${NETCONS_PATH}"/enabled) -eq 0 ]]
+ then
+ echo "test failed: Netconsole should be enabled on bonding interface. Failed" >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+function test_enslave_iff_disabled_netpoll_iface {
+ local ret
+ # Enslaving a netpoll-incapable iface to a bond with netpoll must fail.
+ # Create a veth pair inside TXNS, next to BOND_TX_MAIN_IF; veth
+ # interfaces are known to have IFF_DISABLE_NETPOLL set
+ if ! ip -n "${TXNS}" link add "${VETH0}" type veth peer name "${VETH1}"
+ then
+ echo "Failed to create veth TX interface. Is CONFIG_VETH set?" >&2
+ exit "${ksft_skip}"
+ fi
+ set +e
+ # Expected to fail with RTNETLINK answers: Device or resource busy
+ ip -n "${TXNS}" link set "${VETH0}" master "${BOND_TX_MAIN_IF}" 2> /dev/null
+ ret=$?
+ set -e
+ if [[ $ret -eq 0 ]]
+ then
+ echo "test failed: veth interface should not have been enslaved" >&2
+ exit "${ksft_fail}"
+ fi
+}
+
+# Given that netconsole picks the current net namespace, we need to enable it
+# from inside the TXNS namespace
+function enable_netcons_ns() {
+ ip netns exec "${TXNS}" sh -c \
+ "mount -t configfs configfs /sys/kernel/config && echo 1 > $NETCONS_PATH/enabled"
+}
+
+####################
+# Tests start here #
+####################
+
+# Create regular interfaces using netdevsim and link them
+create_all_ifaces
+
+# Setup the bonding interfaces
+# BOND_RX_MAIN_IF has BOND_RX{1,2}_SLAVE_IF
+# BOND_TX_MAIN_IF has BOND_TX{1,2}_SLAVE_IF
+setup_bonding_ifaces
+
+# Configure the IPs on the bonding interfaces (DSTIF and SRCIF)
+configure_ifaces_ips "${IP_VERSION}"
+
+_create_dynamic_target "${FORMAT}" "${NETCONS_PATH}"
+enable_netcons_ns
+set_user_data
+
+# Test #1 : Create a bonding interface and attach netpoll into
+# the bonding interface. Netconsole/netpoll should work on
+# the bonding interface.
+test_send_netcons_msg_through_bond_iface
+echo "test #1: netpoll on bonding interface worked. Test passed" >&2
+
+# Test #2: Attach netpoll to an enslaved interface
+# Try to attach netpoll to an enslaved sub-interface (while still being part of
+# a bonding interface), which shouldn't be allowed
+test_enable_netpoll_on_enslaved_iface
+echo "test #2: netpoll correctly rejected enslaved interface (expected behavior). Test passed." >&2
+
+# Test #3: Unplug the sub-interface from bond and enable netconsole
+# Detach the interface from a bonding interface and attach netpoll again
+test_delete_bond_and_reenable_target
+echo "test #3: Able to attach to an unbound interface. Test passed." >&2
+
+# Test #4: Enslave a sub-interface that had netconsole enabled
+# Try to enslave an interface that has netconsole/netpoll enabled.
+# Previous test has netconsole enabled in BOND_TX1_SLAVE_IF, try to enslave it
+test_enslave_netcons_enabled_iface
+echo "test #4: Enslaving an interface with netpoll attached. Test passed." >&2
+
+# Test #5: Enslave a sub-interface to a bonding interface
+# Enslave an interface to a bond interface that has netpoll attached
+# At this stage, BOND_TX_MAIN_IF is created and BOND_TX1_SLAVE_IF is part of
+# it. Netconsole is currently disabled
+test_enslave_iface_to_bond
+echo "test #5: Enslaving an interface to bond+netpoll. Test passed." >&2
+
+# Test #6: Enslave an IFF_DISABLE_NETPOLL sub-interface to a bonding interface
+# At this stage, BOND_TX_MAIN_IF has both sub-interfaces and netconsole is
+# enabled. This test will try to enslave a veth (IFF_DISABLE_NETPOLL) interface
+# and it should fail, with netpoll: veth0 doesn't support polling
+test_enslave_iff_disabled_netpoll_iface
+echo "test #6: Enslaving IFF_DISABLE_NETPOLL ifaces to bond iface is not supported. Test passed." >&2
+
+cleanup_bond
+trap - EXIT
+exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
index 8e1085e89647..87f89fd92f8c 100644
--- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
+++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh
@@ -11,9 +11,11 @@ set -euo pipefail
LIBDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
SRCIF="" # to be populated later
+SRCIP="" # to be populated later
SRCIP4="192.0.2.1"
SRCIP6="fc00::1"
DSTIF="" # to be populated later
+DSTIP="" # to be populated later
DSTIP4="192.0.2.2"
DSTIP6="fc00::2"
@@ -28,17 +30,23 @@ NETCONS_PATH="${NETCONS_CONFIGFS}"/"${TARGET}"
# NAMESPACE will be populated by setup_ns with a random value
NAMESPACE=""
-# IDs for netdevsim
+# IDs for netdevsim. We either use NSIM_DEV_{1,2}_ID for standard test
+# or NSIM_BOND_{T,R}X_{1,2} for the bonding tests. Not both at the
+# same time.
NSIM_DEV_1_ID=$((256 + RANDOM % 256))
NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_BOND_TX_1=$((768 + RANDOM % 256))
+NSIM_BOND_TX_2=$((1024 + RANDOM % 256))
+NSIM_BOND_RX_1=$((1280 + RANDOM % 256))
+NSIM_BOND_RX_2=$((1536 + RANDOM % 256))
NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
+NSIM_DEV_SYS_LINK="/sys/bus/netdevsim/link_device"
# Used to create and delete namespaces
source "${LIBDIR}"/../../../../net/lib.sh
# Create netdevsim interfaces
create_ifaces() {
-
echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
udevadm settle 2> /dev/null || true
@@ -113,31 +121,38 @@ function set_network() {
configure_ip
}
-function create_dynamic_target() {
- local FORMAT=${1:-"extended"}
+function _create_dynamic_target() {
+ local FORMAT="${1:?FORMAT parameter required}"
+ local NCPATH="${2:?NCPATH parameter required}"
DSTMAC=$(ip netns exec "${NAMESPACE}" \
ip link show "${DSTIF}" | awk '/ether/ {print $2}')
# Create a dynamic target
- mkdir "${NETCONS_PATH}"
+ mkdir "${NCPATH}"
- echo "${DSTIP}" > "${NETCONS_PATH}"/remote_ip
- echo "${SRCIP}" > "${NETCONS_PATH}"/local_ip
- echo "${DSTMAC}" > "${NETCONS_PATH}"/remote_mac
- echo "${SRCIF}" > "${NETCONS_PATH}"/dev_name
+ echo "${DSTIP}" > "${NCPATH}"/remote_ip
+ echo "${SRCIP}" > "${NCPATH}"/local_ip
+ echo "${DSTMAC}" > "${NCPATH}"/remote_mac
+ echo "${SRCIF}" > "${NCPATH}"/dev_name
if [ "${FORMAT}" == "basic" ]
then
# Basic target does not support release
- echo 0 > "${NETCONS_PATH}"/release
- echo 0 > "${NETCONS_PATH}"/extended
+ echo 0 > "${NCPATH}"/release
+ echo 0 > "${NCPATH}"/extended
elif [ "${FORMAT}" == "extended" ]
then
- echo 1 > "${NETCONS_PATH}"/extended
+ echo 1 > "${NCPATH}"/extended
fi
+}
- echo 1 > "${NETCONS_PATH}"/enabled
+function create_dynamic_target() {
+ local FORMAT=${1:-"extended"}
+ local NCPATH=${2:-"$NETCONS_PATH"}
+ _create_dynamic_target "${FORMAT}" "${NCPATH}"
+
+ echo 1 > "${NCPATH}"/enabled
# This will make sure that the kernel was able to
# load the netconsole driver configuration. The console message
@@ -185,14 +200,26 @@ function do_cleanup() {
echo "${DEFAULT_PRINTK_VALUES}" > /proc/sys/kernel/printk
}
-function cleanup() {
+function cleanup_netcons() {
# delete netconsole dynamic reconfiguration
- echo 0 > "${NETCONS_PATH}"/enabled
+ # do not fail if the target is already disabled
+ if [[ ! -d "${NETCONS_PATH}" ]]
+ then
+ # in some cases this is called before netcons path is created
+ return
+ fi
+ if [[ $(cat "${NETCONS_PATH}"/enabled) != 0 ]]
+ then
+ echo 0 > "${NETCONS_PATH}"/enabled || true
+ fi
# Remove all the keys that got created during the selftest
find "${NETCONS_PATH}/userdata/" -mindepth 1 -type d -delete
# Remove the configfs entry
rmdir "${NETCONS_PATH}"
+}
+function cleanup() {
+ cleanup_netcons
do_cleanup
}
@@ -369,3 +396,24 @@ function wait_for_port() {
# more frequently on IPv6
sleep 1
}
+
+# Clean up netdevsim ifaces created for bonding test. This may run from an
+# EXIT trap before the bonds exist; ":-" keeps "set -u" from aborting cleanup.
+function cleanup_bond_nsim() {
+ # Names default to empty; the failed delete is then absorbed by "|| true"
+ ip -n "${TXNS:-}" link delete "${BOND_TX_MAIN_IF:-}" type bond || true
+ ip -n "${RXNS:-}" link delete "${BOND_RX_MAIN_IF:-}" type bond || true
+
+ cleanup_netdevsim "$NSIM_BOND_TX_1"
+ cleanup_netdevsim "$NSIM_BOND_TX_2"
+ cleanup_netdevsim "$NSIM_BOND_RX_1"
+ cleanup_netdevsim "$NSIM_BOND_RX_2"
+}
+
+# cleanup tests that use bonding interfaces
+function cleanup_bond() {
+ cleanup_netcons
+ cleanup_bond_nsim
+ cleanup_all_ns
+ ip link delete "${VETH0}" || true
+}
diff --git a/tools/testing/selftests/drivers/net/netcons_torture.sh b/tools/testing/selftests/drivers/net/netcons_torture.sh
new file mode 100755
index 000000000000..2ce9ee3719d1
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netcons_torture.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Repeatedly send kernel messages, toggle netconsole targets on and off,
+# create and delete targets in parallel, and toggle the source interface to
+# simulate stress conditions.
+#
+# This test aims to verify the robustness of netconsole under dynamic
+# configurations and concurrent operations.
+#
+# The major goal is to run this test with LOCKDEP, Kmemleak and KASAN to make
+# sure no issue is reported.
+#
+# Author: Breno Leitao <leitao@debian.org>
+
+set -euo pipefail
+
+SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")")
+
+source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh
+
+# Number of times the main loop runs
+ITERATIONS=${1:-150}
+
+# Only test extended format
+FORMAT="extended"
+# And ipv6 only
+IP_VERSION="ipv6"
+
+# Create, populate and delete some targets.
+create_and_delete_random_target() {
+ COUNT=2
+ RND_PREFIX=$(mktemp -u netcons_rnd_XXXX_)
+ # Targets are numbered 1..COUNT; skip if the first or last one exists
+ if [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}${COUNT}" ] || \
+ [ -d "${NETCONS_CONFIGFS}/${RND_PREFIX}1" ]; then
+ echo "Function didn't finish yet, skipping it." >&2
+ return
+ fi
+
+ # create and populate COUNT targets
+ for i in $(seq ${COUNT})
+ do
+ RND_TARGET="${RND_PREFIX}"${i}
+ RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+
+ # Basic population so the target can come up
+ _create_dynamic_target "${FORMAT}" "${RND_TARGET_PATH}"
+ done
+
+ echo "netconsole selftest: ${COUNT} additional targets were created" > /dev/kmsg
+ # disable them all (if enabled) and remove them
+ for i in $(seq ${COUNT})
+ do
+ RND_TARGET="${RND_PREFIX}"${i}
+ RND_TARGET_PATH="${NETCONS_CONFIGFS}"/"${RND_TARGET}"
+ if [[ $(cat "${RND_TARGET_PATH}/enabled") -eq 1 ]]
+ then
+ echo 0 > "${RND_TARGET_PATH}"/enabled
+ fi
+ rmdir "${RND_TARGET_PATH}"
+ done
+}
+
+# Disable and enable the target mid-air, while messages
+# are being transmitted.
+toggle_netcons_target() {
+ for i in $(seq 2)
+ do
+ if [ ! -d "${NETCONS_PATH}" ]
+ then
+ break
+ fi
+ echo 0 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+ # Try to enable a bit harder, given it might fail to enable
+ # Write to `enabled` might fail depending on the lock, which is
+ # highly contentious here
+ for _ in $(seq 5)
+ do
+ echo 1 > "${NETCONS_PATH}"/enabled 2> /dev/null || true
+ done
+ done
+}
+
+toggle_iface(){
+ ip link set "${SRCIF}" down
+ ip link set "${SRCIF}" up
+}
+
+# Start here
+
+modprobe netdevsim 2> /dev/null || true
+modprobe netconsole 2> /dev/null || true
+
+# Check for basic system dependency and exit if not found
+check_for_dependencies
+# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5)
+echo "6 5" > /proc/sys/kernel/printk
+# Remove the namespace, interfaces and netconsole target on exit
+trap cleanup EXIT
+# Create one namespace and two interfaces
+set_network "${IP_VERSION}"
+# Create a dynamic target for netconsole
+create_dynamic_target "${FORMAT}"
+
+for i in $(seq "$ITERATIONS")
+do
+ for _ in $(seq 10)
+ do
+ echo "${MSG}: ${TARGET} ${i}" > /dev/kmsg
+ done
+ wait
+
+ if (( i % 30 == 0 )); then
+ toggle_netcons_target &
+ fi
+
+ if (( i % 50 == 0 )); then
+ # create some targets, enable them, send msg and disable
+ # all in a parallel thread
+ create_and_delete_random_target &
+ fi
+
+ if (( i % 70 == 0 )); then
+ toggle_iface &
+ fi
+done
+wait
+
+exit "${EXIT_STATUS}"
diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c
index c43a69dffd83..a0c64f415a7f 100644
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@@ -487,7 +487,7 @@ int setup_userns(void)
uid_t uid = getuid();
gid_t gid = getgid();
- ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID);
+ ret = unshare(CLONE_NEWNS|CLONE_NEWUSER);
if (ret) {
ksft_exit_fail_msg("unsharing mountns and userns: %s\n",
strerror(errno));
diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
index c62165fabd0c..cfa16aa1f39a 100644
--- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
+++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc
@@ -20,6 +20,10 @@ sample_events() {
echo 0 > tracing_on
echo 0 > events/enable
+# Clear functions caused by page cache; run sample_events twice
+sample_events
+sample_events
+
echo "Get the most frequently calling function"
echo > trace
sample_events
diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c
index c9b84eeaab6b..0a3a94c4cca1 100644
--- a/tools/testing/selftests/kvm/arm64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c
@@ -63,11 +63,13 @@ static struct feature_id_reg feat_id_regs[] = {
REG_FEAT(HDFGWTR2_EL2, ID_AA64MMFR0_EL1, FGT, FGT2),
REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP),
REG_FEAT(SCTLR2_EL1, ID_AA64MMFR3_EL1, SCTLRX, IMP),
+ REG_FEAT(SCTLR2_EL2, ID_AA64MMFR3_EL1, SCTLRX, IMP),
REG_FEAT(VDISR_EL2, ID_AA64PFR0_EL1, RAS, IMP),
REG_FEAT(VSESR_EL2, ID_AA64PFR0_EL1, RAS, IMP),
REG_FEAT(VNCR_EL2, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY),
REG_FEAT(CNTHV_CTL_EL2, ID_AA64MMFR1_EL1, VH, IMP),
REG_FEAT(CNTHV_CVAL_EL2,ID_AA64MMFR1_EL1, VH, IMP),
+ REG_FEAT(ZCR_EL2, ID_AA64PFR0_EL1, SVE, IMP),
};
bool filter_reg(__u64 reg)
@@ -718,6 +720,7 @@ static __u64 el2_regs[] = {
SYS_REG(VMPIDR_EL2),
SYS_REG(SCTLR_EL2),
SYS_REG(ACTLR_EL2),
+ SYS_REG(SCTLR2_EL2),
SYS_REG(HCR_EL2),
SYS_REG(MDCR_EL2),
SYS_REG(CPTR_EL2),
diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
index 09f270545646..0e2f8ed90f30 100644
--- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
+++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c
@@ -15,6 +15,8 @@
#include "gic_v3.h"
#include "processor.h"
+#define GITS_COLLECTION_TARGET_SHIFT 16
+
static u64 its_read_u64(unsigned long offset)
{
return readq_relaxed(GITS_BASE_GVA + offset);
@@ -163,6 +165,11 @@ static void its_encode_collection(struct its_cmd_block *cmd, u16 col)
its_mask_encode(&cmd->raw_cmd[2], col, 15, 0);
}
+static u64 procnum_to_rdbase(u32 vcpu_id)
+{
+ return (u64)vcpu_id << GITS_COLLECTION_TARGET_SHIFT; /* widen, then shift */
+}
+
#define GITS_CMDQ_POLL_ITERATIONS 0
static void its_send_cmd(void *cmdq_base, struct its_cmd_block *cmd)
@@ -217,7 +224,7 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val
its_encode_cmd(&cmd, GITS_CMD_MAPC);
its_encode_collection(&cmd, collection_id);
- its_encode_target(&cmd, vcpu_id);
+ its_encode_target(&cmd, procnum_to_rdbase(vcpu_id));
its_encode_valid(&cmd, valid);
its_send_cmd(cmdq_base, &cmd);
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 9e3be2ee7f1b..f917b4c4c943 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -1758,10 +1758,15 @@ int main(int argc, char *argv[])
uffd_test_ops = mem_type->mem_ops;
uffd_test_case_ops = test->test_case_ops;
- if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB))
+ if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) {
gopts.page_size = default_huge_page_size();
- else
+ if (gopts.page_size == 0) {
+ uffd_test_skip("huge page size is 0, feature missing?");
+ continue;
+ }
+ } else {
gopts.page_size = psize();
+ }
/* Ensure we have at least 2 pages */
gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2)
@@ -1776,12 +1781,6 @@ int main(int argc, char *argv[])
continue;
uffd_test_start("%s on %s", test->name, mem_type->name);
- if ((mem_type->mem_flag == MEM_HUGETLB ||
- mem_type->mem_flag == MEM_HUGETLB_PRIVATE) &&
- (default_huge_page_size() == 0)) {
- uffd_test_skip("huge page size is 0, feature missing?");
- continue;
- }
if (!uffd_feature_supported(test)) {
uffd_test_skip("feature missing");
continue;
diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore
index ccfb40837a73..0989e80da457 100644
--- a/tools/testing/selftests/namespaces/.gitignore
+++ b/tools/testing/selftests/namespaces/.gitignore
@@ -1,3 +1,12 @@
nsid_test
file_handle_test
init_ino_test
+ns_active_ref_test
+listns_test
+listns_permissions_test
+listns_efault_test
+siocgskns_test
+cred_change_test
+stress_test
+listns_pagination_bug
+regression_pidfd_setns_test
diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile
index 5fe4b3dc07d3..fbb821652c17 100644
--- a/tools/testing/selftests/namespaces/Makefile
+++ b/tools/testing/selftests/namespaces/Makefile
@@ -1,7 +1,29 @@
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+LDLIBS += -lcap
-TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test
+TEST_GEN_PROGS := nsid_test \
+ file_handle_test \
+ init_ino_test \
+ ns_active_ref_test \
+ listns_test \
+ listns_permissions_test \
+ listns_efault_test \
+ siocgskns_test \
+ cred_change_test \
+ stress_test \
+ listns_pagination_bug \
+ regression_pidfd_setns_test
include ../lib.mk
+$(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c
+$(OUTPUT)/listns_test: ../filesystems/utils.c
+$(OUTPUT)/listns_permissions_test: ../filesystems/utils.c
+$(OUTPUT)/listns_efault_test: ../filesystems/utils.c
+$(OUTPUT)/siocgskns_test: ../filesystems/utils.c
+$(OUTPUT)/cred_change_test: ../filesystems/utils.c
+$(OUTPUT)/stress_test: ../filesystems/utils.c
+$(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c
+$(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c
+
diff --git a/tools/testing/selftests/namespaces/cred_change_test.c b/tools/testing/selftests/namespaces/cred_change_test.c
new file mode 100644
index 000000000000..7b4f5ad3f725
--- /dev/null
+++ b/tools/testing/selftests/namespaces/cred_change_test.c
@@ -0,0 +1,814 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test credential changes and their impact on namespace active references.
+ */
+
+/*
+ * Test setuid() in a user namespace properly swaps active references.
+ * Create a user namespace with multiple UIDs mapped, then setuid() between them.
+ * Verify that the user namespace remains active throughout.
+ */
+TEST(setuid_preserves_active_refs)
+{
+ pid_t pid;
+ int status;
+ __u64 userns_id;
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 ns_ids[256];
+ ssize_t ret;
+ int i;
+ bool found = false;
+ int pipefd[2];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process */
+ int fd, userns_fd;
+ __u64 child_userns_id;
+ uid_t orig_uid = getuid();
+ int setuid_count;
+
+ close(pipefd[0]);
+
+ /* Create new user namespace with multiple UIDs mapped (0-9) */
+ userns_fd = get_userns_fd(0, orig_uid, 10);
+ if (userns_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+ close(userns_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(userns_fd);
+
+ /* Get user namespace ID */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+ close(fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send namespace ID to parent */
+ write(pipefd[1], &child_userns_id, sizeof(child_userns_id));
+
+ /*
+ * Perform multiple setuid() calls.
+ * Each setuid() triggers commit_creds() which should properly
+ * swap active references via switch_cred_namespaces().
+ */
+ for (setuid_count = 0; setuid_count < 50; setuid_count++) {
+ uid_t target_uid = (setuid_count % 10);
+ if (setuid(target_uid) < 0) {
+ if (errno != EPERM) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ }
+ }
+
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent process */
+ close(pipefd[1]);
+
+ if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+ close(pipefd[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace ID from child");
+ }
+ close(pipefd[0]);
+
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id);
+
+ /* Verify namespace is active while child is running */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ if (ret < 0) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ if (errno == ENOSYS)
+ SKIP(return, "listns() not supported");
+ ASSERT_GE(ret, 0);
+ }
+
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Verify namespace becomes inactive after child exits */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+ ASSERT_GE(ret, 0);
+
+ found = false;
+ for (i = 0; i < ret; i++) {
+ if (ns_ids[i] == userns_id) {
+ found = true;
+ break;
+ }
+ }
+
+ ASSERT_FALSE(found);
+ TH_LOG("setuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setgid() in a user namespace properly handles active references.
+ */
+TEST(setgid_preserves_active_refs)
+{
+	pid_t pid;
+	int status;
+	__u64 userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,	/* list user namespaces only */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: only writes to the pipe */
+		int fd, userns_fd;
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int setgid_count;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace with multiple GIDs mapped */
+		userns_fd = get_userns_fd(0, orig_uid, 10);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		if (write(pipefd[1], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id))
+			exit(1);	/* the parent cannot identify the namespace without the ID */
+		/* Perform multiple setgid() calls, cycling through GIDs 0..9 */
+		for (setgid_count = 0; setgid_count < 50; setgid_count++) {
+			gid_t target_gid = (setgid_count % 10);
+			if (setgid(target_gid) < 0) {
+				if (errno != EPERM) {	/* EPERM is tolerated, anything else fails */
+					close(pipefd[1]);
+					exit(1);
+				}
+			}
+		}
+
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify namespace becomes inactive: it must no longer be listed */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == userns_id) {
+			found = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found);
+	TH_LOG("setgid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test setresuid() which changes real, effective, and saved UIDs.
+ * This should properly swap active references via commit_creds().
+ */
+TEST(setresuid_preserves_active_refs)
+{
+	pid_t pid;
+	int status;
+	__u64 userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,	/* list user namespaces only */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: only writes to the pipe */
+		int fd, userns_fd;
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int setres_count;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace */
+		userns_fd = get_userns_fd(0, orig_uid, 10);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		if (write(pipefd[1], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id))
+			exit(1);	/* the parent cannot identify the namespace without the ID */
+		/* Perform multiple setresuid() calls with rotating real/effective/saved UIDs */
+		for (setres_count = 0; setres_count < 30; setres_count++) {
+			uid_t uid1 = (setres_count % 5);
+			uid_t uid2 = ((setres_count + 1) % 5);
+			uid_t uid3 = ((setres_count + 2) % 5);
+
+			if (setresuid(uid1, uid2, uid3) < 0) {
+				if (errno != EPERM) {	/* EPERM is tolerated, anything else fails */
+					close(pipefd[1]);
+					exit(1);
+				}
+			}
+		}
+
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify namespace becomes inactive: it must no longer be listed */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == userns_id) {
+			found = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found);
+	TH_LOG("setresuid() correctly preserved active references (no leak)");
+}
+
+/*
+ * Test credential changes across multiple user namespaces.
+ * Create nested user namespaces and verify active reference tracking.
+ */
+TEST(cred_change_nested_userns)
+{
+	pid_t pid;
+	int status;
+	__u64 parent_userns_id, child_userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,	/* list user namespaces only */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found_parent = false, found_child = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: only writes to the pipe */
+		int fd, userns_fd;
+		__u64 parent_id, child_id;
+		uid_t orig_uid = getuid();
+
+		close(pipefd[0]);
+
+		/* Create first user namespace */
+		userns_fd = get_userns_fd(0, orig_uid, 1);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get first namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &parent_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Create nested user namespace */
+		userns_fd = get_userns_fd(0, 0, 1);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get nested namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send both IDs to parent */
+		if (write(pipefd[1], &parent_id, sizeof(parent_id)) != sizeof(parent_id) ||
+		    write(pipefd[1], &child_id, sizeof(child_id)) != sizeof(child_id))
+			exit(1);
+		close(pipefd[1]);
+		/* Perform some credential changes in nested namespace */
+		setuid(0);
+		setgid(0);
+		raise(SIGSTOP);	/* stay alive until the parent has listed both namespaces */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	/* Read both namespace IDs */
+	if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get parent namespace ID");
+	}
+
+	if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get child namespace ID");
+	}
+	close(pipefd[0]);
+
+	TH_LOG("Parent userns: %llu, Child userns: %llu",
+	       (unsigned long long)parent_userns_id, (unsigned long long)child_userns_id);
+
+	/* Verify both namespaces are active; the child is still running or stopped here */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		ret = -errno;	/* latch the error: kill()/waitpid() may overwrite errno */
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		if (ret == -ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == parent_userns_id)
+			found_parent = true;
+		if (ns_ids[i] == child_userns_id)
+			found_child = true;
+	}
+	/* Wait until the child has stopped itself, then resume it before asserting */
+	ASSERT_EQ(waitpid(pid, &status, WUNTRACED), pid);
+	ASSERT_TRUE(WIFSTOPPED(status));
+	kill(pid, SIGCONT);
+	ASSERT_TRUE(found_parent);
+	ASSERT_TRUE(found_child);
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify both namespaces become inactive */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	found_parent = false;
+	found_child = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == parent_userns_id)
+			found_parent = true;
+		if (ns_ids[i] == child_userns_id)
+			found_child = true;
+	}
+
+	ASSERT_FALSE(found_parent);
+	ASSERT_FALSE(found_child);
+	TH_LOG("Nested user namespace credential changes preserved active refs (no leak)");
+}
+
+/*
+ * Test rapid credential changes don't cause refcount imbalances.
+ * This stress-tests the switch_cred_namespaces() logic.
+ */
+TEST(rapid_cred_changes_no_leak)
+{
+	pid_t pid;
+	int status;
+	__u64 userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,	/* list user namespaces only */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: only writes to the pipe */
+		int fd, userns_fd;
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int change_count;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace with wider range of UIDs/GIDs */
+		userns_fd = get_userns_fd(0, orig_uid, 100);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		if (write(pipefd[1], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id))
+			exit(1);	/* the parent cannot identify the namespace without the ID */
+		/*
+		 * Perform many rapid credential changes.
+		 * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid.
+		 */
+		for (change_count = 0; change_count < 200; change_count++) {
+			switch (change_count % 6) {
+			case 0:
+				setuid(change_count % 50);
+				break;
+			case 1:
+				setgid(change_count % 50);
+				break;
+			case 2:
+				setreuid(change_count % 50, (change_count + 1) % 50);
+				break;
+			case 3:
+				setregid(change_count % 50, (change_count + 1) % 50);
+				break;
+			case 4:
+				setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+				break;
+			case 5:
+				setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50);
+				break;
+			}
+		}
+
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+	close(pipefd[0]);
+
+	TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify namespace becomes inactive (no leaked active refs) */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == userns_id) {
+			found = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found);
+	TH_LOG("200 rapid credential changes completed with no active ref leak");
+}
+
+/*
+ * Test setfsuid/setfsgid which change filesystem UID/GID.
+ * These also trigger credential changes but may have different code paths.
+ */
+TEST(setfsuid_preserves_active_refs)
+{
+	pid_t pid;
+	int status;
+	__u64 userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,	/* list user namespaces only */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	ssize_t ret;
+	int i;
+	bool found = false;
+	int pipefd[2];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: only writes to the pipe */
+		int fd, userns_fd;
+		__u64 child_userns_id;
+		uid_t orig_uid = getuid();
+		int change_count;
+
+		close(pipefd[0]);
+
+		/* Create new user namespace */
+		userns_fd = get_userns_fd(0, orig_uid, 10);
+		if (userns_fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+			close(userns_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(userns_fd);
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		if (write(pipefd[1], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id))
+			exit(1);	/* the parent cannot identify the namespace without the ID */
+		/* Perform multiple setfsuid/setfsgid calls; their results are not checked */
+		for (change_count = 0; change_count < 50; change_count++) {
+			setfsuid(change_count % 10);
+			setfsgid(change_count % 10);
+		}
+
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) {
+		close(pipefd[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get namespace ID from child");
+	}
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Verify namespace becomes inactive: it must no longer be listed */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == userns_id) {
+			found = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found);
+	TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_efault_test.c b/tools/testing/selftests/namespaces/listns_efault_test.c
new file mode 100644
index 000000000000..c7ed4023d7a8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_efault_test.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "../pidfd/pidfd.h"
+#include "wrappers.h"
+
+/*
+ * Test listns() error handling with invalid buffer addresses.
+ *
+ * When the buffer pointer is invalid (e.g., crossing page boundaries
+ * into unmapped memory), listns() returns EINVAL.
+ *
+ * This test also creates mount namespaces that get destroyed during
+ * iteration, testing that namespace cleanup happens outside the RCU
+ * read lock.
+ */
+TEST(listns_partial_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid, pids[5];
+	int pidfds[5];
+	int sv[5][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/*
+	 * Map two pages:
+	 * - First page: readable and writable
+	 * - Second page: will be unmapped to trigger EFAULT
+	 */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position the buffer pointer so there's room for exactly one u64
+	 * before the page boundary. The second u64 would fall into the
+	 * unmapped page.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
+
+	/*
+	 * Create a separate process to run listns() in a loop concurrently
+	 * with namespace creation and destruction.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0, /* All types */
+			.spare2 = 0,
+			.user_ns_id = 0, /* Global listing */
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * The kernel should:
+		 * 1. Successfully write the first namespace ID (within valid page)
+		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
+		 * 3. Handle concurrent namespace destruction without deadlock
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 2, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create several child processes, each in its own mount namespace.
+	 * These will be destroyed while the iterator is running listns().
+	 */
+	for (i = 0; i < 5; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create a couple of tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 5; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/*
+	 * Signal children to exit. This will destroy their mount namespaces
+	 * while listns() is iterating the namespace tree.
+	 * This tests that cleanup happens outside the RCU read lock.
+	 */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each mount namespace child by PID so the iterator is never reaped here */
+	for (i = 0; i < 5; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	/* The iterator exits on its own with PIDFD_SKIP when listns() is unsupported */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test listns() error handling when the entire buffer is invalid.
+ * This is a sanity check that basic invalid pointer detection works.
+ */
+TEST(listns_complete_fault)
+{
+	/* A clearly invalid pointer, handed in as the result buffer */
+	__u64 *bogus_buf = (__u64 *)0xdeadbeef;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	ssize_t ret;
+
+	/* Request up to 10 IDs into the bogus buffer */
+	ret = sys_listns(&req, bogus_buf, 10, 0);
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/*
+	 * The call is expected to fail with -1 and errno set to EFAULT.
+	 */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() error handling when the buffer is NULL.
+ */
+TEST(listns_null_buffer)
+{
+	ssize_t ret;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+
+	/* Passing NULL while asking for 10 entries must fail */
+	ret = sys_listns(&req, NULL, 10, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+
+	/* Expected outcome: -1 with errno set to EFAULT */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, EFAULT);
+}
+
+/*
+ * Test listns() with a buffer that becomes invalid mid-iteration
+ * (after several successful writes), combined with mount namespace
+ * destruction to test RCU cleanup logic.
+ */
+TEST(listns_late_fault_with_ns_cleanup)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid, pids[10];
+	int pidfds[10];
+	int sv[10][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Map two pages */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	/* Unmap the second page */
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/*
+	 * Position buffer so we can write several u64s successfully
+	 * before hitting the page boundary.
+	 */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Request 10 namespace IDs while namespaces are being destroyed.
+		 * This tests:
+		 * 1. EFAULT handling when buffer becomes invalid
+		 * 2. Namespace cleanup outside RCU read lock during iteration
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/*
+	 * Create more children with mount namespaces to increase the
+	 * likelihood that namespace cleanup happens during iteration.
+	 */
+	for (i = 0; i < 10; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Child: create tmpfs mounts */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 10; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Tell half the children to exit */
+	for (i = 0; i < 5; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Small delay to let some exit */
+	usleep(10000);
+
+	/* Tell the remaining children to exit */
+	for (i = 5; i < 10; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each child by PID so the iterator is never reaped here */
+	for (i = 0; i < 10; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator and wait for it */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	/* The iterator exits on its own with PIDFD_SKIP when listns() is unsupported */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+/*
+ * Test specifically focused on mount namespace cleanup during EFAULT.
+ * Filter for mount namespaces only.
+ */
+TEST(listns_mnt_ns_cleanup_on_fault)
+{
+	void *map;
+	__u64 *ns_ids;
+	ssize_t ret;
+	long page_size;
+	pid_t pid, iter_pid, pids[8];
+	int pidfds[8];
+	int sv[8][2];
+	int iter_pidfd;
+	int i, status;
+	char c;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	ASSERT_GT(page_size, 0);
+
+	/* Set up partial fault buffer */
+	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(map, MAP_FAILED);
+
+	ret = munmap((char *)map + page_size, page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* Position for 3 successful writes, then fault */
+	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
+
+	/*
+	 * Create a separate process to run listns() concurrently.
+	 */
+	iter_pid = create_child(&iter_pidfd, 0);
+	ASSERT_NE(iter_pid, -1);
+
+	if (iter_pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNS, /* Only mount namespaces */
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		int iter_ret;
+
+		/*
+		 * Loop calling listns() until killed.
+		 * Call listns() to race with namespace destruction.
+		 */
+		while (1) {
+			iter_ret = sys_listns(&req, ns_ids, 10, 0);
+
+			if (iter_ret == -1 && errno == ENOSYS)
+				_exit(PIDFD_SKIP);
+		}
+	}
+
+	/* Small delay to let iterator start looping */
+	usleep(50000);
+
+	/* Create children with mount namespaces */
+	for (i = 0; i < 8; i++) {
+		/* Create socketpair for synchronization */
+		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
+
+		pids[i] = pid = create_child(&pidfds[i], CLONE_NEWNS);
+		ASSERT_NE(pid, -1);
+
+		if (pid == 0) {
+			close(sv[i][0]); /* Close parent end */
+
+			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
+				_exit(1);
+
+			/* Do some mount operations to make cleanup more interesting */
+			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
+				_exit(1);
+
+			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
+				_exit(1);
+
+			/* Signal parent that setup is complete */
+			if (write_nointr(sv[i][1], "R", 1) != 1)
+				_exit(1);
+
+			/* Wait for parent to signal us to exit */
+			if (read_nointr(sv[i][1], &c, 1) != 1)
+				_exit(1);
+
+			close(sv[i][1]);
+			_exit(0);
+		}
+
+		close(sv[i][1]); /* Close child end */
+	}
+
+	/* Wait for all children to finish setup */
+	for (i = 0; i < 8; i++) {
+		ret = read_nointr(sv[i][0], &c, 1);
+		ASSERT_EQ(ret, 1);
+		ASSERT_EQ(c, 'R');
+	}
+
+	/* Tell children to exit to trigger namespace destruction during iteration */
+	for (i = 0; i < 8; i++)
+		write_nointr(sv[i][0], "X", 1);
+
+	/* Reap each child by PID so the iterator is never reaped here */
+	for (i = 0; i < 8; i++) {
+		waitpid(pids[i], NULL, 0);
+		close(sv[i][0]);
+		close(pidfds[i]);
+	}
+
+	/* Kill iterator (it exits on its own with PIDFD_SKIP without listns()) and wait */
+	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
+	ret = waitpid(iter_pid, &status, 0);
+	ASSERT_EQ(ret, iter_pid);
+	close(iter_pidfd);
+	munmap(map, page_size);
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP)
+		SKIP(return, "listns() not supported");
+	ASSERT_TRUE(WIFSIGNALED(status));
+	ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_pagination_bug.c b/tools/testing/selftests/namespaces/listns_pagination_bug.c
new file mode 100644
index 000000000000..da7d33f96397
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_pagination_bug.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Minimal test case to reproduce KASAN out-of-bounds in listns pagination.
+ *
+ * The bug occurs when:
+ * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER)
+ * 2. Using pagination (req.ns_id != 0)
+ * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of
+ * the filtered type, causing it to search the unified tree and potentially
+ * return a namespace of the wrong type.
+ */
+TEST(pagination_with_type_filter)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,  /* Filter by user namespace */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[10];
+	int num_children = 10;
+	int i;
+	int sv[2];	/* sv[0]: parent end, sv[1]: end shared by all children */
+	__u64 first_batch[3];
+	ssize_t ret;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			char c = 'R';	/* ready byte; was sent uninitialized before */
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready (one byte per child) */
+	for (i = 0; i < num_children; i++) {
+		char c;
+		if (read(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* First batch - this should work */
+	ret = sys_listns(&req, first_batch, 3, 0);
+	if (ret < 0) {
+		if (errno == ENOSYS) {
+			close(sv[0]);
+			for (i = 0; i < num_children; i++)
+				kill(pids[i], SIGKILL);
+			for (i = 0; i < num_children; i++)
+				waitpid(pids[i], NULL, 0);
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("First batch returned %zd entries", ret);
+
+	if (ret == 3) {
+		__u64 second_batch[3];
+
+		/* Second batch - pagination triggers the bug */
+		req.ns_id = first_batch[2];  /* Continue from last ID */
+		ret = sys_listns(&req, second_batch, 3, 0);
+
+		TH_LOG("Second batch returned %zd entries", ret);
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+		if (write(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Cleanup */
+	for (i = 0; i < num_children; i++) {
+		int status;
+		waitpid(pids[i], &status, 0);
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_permissions_test.c b/tools/testing/selftests/namespaces/listns_permissions_test.c
new file mode 100644
index 000000000000..82d818751a5f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_permissions_test.c
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/capability.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Test what a process inside a freshly created user namespace sees when it
+ * lists network namespaces.
+ *
+ * The child calls setup_userns(), unshares a network namespace, lists all
+ * network namespaces and reports back (a) whether its own network namespace
+ * ID was returned and (b) how many other IDs were returned. Only (a) is
+ * asserted; (b) is logged.
+ */
+TEST(listns_unprivileged_current_only)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[100];
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool found_ours = false;
+	int unexpected_count = 0;
+	bool got_result;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 our_netns_id;
+		ssize_t ret;
+
+		close(pipefd[0]);
+
+		/* Create user namespace to be unprivileged */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create a network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get our network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Now we're unprivileged - list all network namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Classify every returned ID as ours or "something else" */
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == our_netns_id) {
+				found_ours = true;
+			} else {
+				/* This is either init_net (which we can see) or unexpected */
+				unexpected_count++;
+			}
+		}
+
+		/* Send results to parent; a failed or short write fails the child */
+		if (write(pipefd[1], &found_ours, sizeof(found_ours)) != sizeof(found_ours) ||
+		    write(pipefd[1], &unexpected_count, sizeof(unexpected_count)) != sizeof(unexpected_count)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	got_result = read(pipefd[0], &found_ours, sizeof(found_ours)) == sizeof(found_ours) &&
+		     read(pipefd[0], &unexpected_count, sizeof(unexpected_count)) == sizeof(unexpected_count);
+	close(pipefd[0]);
+
+	/* Reap the child before asserting so a failure does not leave it behind */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Both result fields must have arrived in full */
+	ASSERT_TRUE(got_result);
+
+	/* Child should have seen its own namespace */
+	ASSERT_TRUE(found_ours);
+
+	TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)",
+	       unexpected_count);
+}
+
+/*
+ * Test that a process which created a user namespace can list the
+ * namespaces owned by that user namespace.
+ *
+ * The child calls setup_userns(), records the user namespace ID, unshares
+ * net, uts and ipc namespaces and then lists with user_ns_id set to the
+ * recorded ID. At least four entries are expected.
+ */
+TEST(listns_cap_sys_admin_in_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be set to our created user namespace */
+	};
+	__u64 ns_ids[100];
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success = false;
+	bool got_result;
+	ssize_t count = 0;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 userns_id;
+		ssize_t ret;
+		int min_expected;
+
+		close(pipefd[0]);
+
+		/* Create user namespace - we'll have CAP_SYS_ADMIN in it */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get the user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create several namespaces owned by this user namespace.
+		 * The expected count below relies on all three existing, so
+		 * a failed unshare() must fail the child rather than be
+		 * silently ignored.
+		 */
+		if (unshare(CLONE_NEWNET) < 0 ||
+		    unshare(CLONE_NEWUTS) < 0 ||
+		    unshare(CLONE_NEWIPC) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List namespaces owned by our user namespace */
+		req.user_ns_id = userns_id;
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/*
+		 * We have CAP_SYS_ADMIN in this user namespace,
+		 * so we should see all namespaces owned by it.
+		 * That includes: net, uts, ipc, and the user namespace itself.
+		 */
+		min_expected = 4;
+		success = (ret >= min_expected);
+
+		/* Report verdict and raw count; short writes fail the child */
+		if (write(pipefd[1], &success, sizeof(success)) != sizeof(success) ||
+		    write(pipefd[1], &ret, sizeof(ret)) != sizeof(ret)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	got_result = read(pipefd[0], &success, sizeof(success)) == sizeof(success) &&
+		     read(pipefd[0], &count, sizeof(count)) == sizeof(count);
+	close(pipefd[0]);
+
+	/* Reap the child before asserting on anything it sent */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(got_result);
+	ASSERT_TRUE(success);
+	TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace",
+	       count);
+}
+
+/*
+ * Test that users cannot see namespaces from unrelated user namespaces.
+ *
+ * Child A creates a user namespace plus a network namespace and reports the
+ * network namespace ID. Child B creates a sibling user namespace, lists all
+ * network namespaces and reports whether A's ID showed up. It must not.
+ *
+ * Child A has to stay alive while B performs its listing, so it blocks on
+ * the read end of a dedicated "hold" pipe until the parent closes the write
+ * end. (Reading from the write end of the result pipe, as was done
+ * previously, fails immediately and does not keep the child alive.)
+ */
+TEST(listns_cannot_see_sibling_userns_namespaces)
+{
+	int pipefd[2];		/* child A -> parent: network namespace ID */
+	int holdfd[2];		/* parent -> child A: EOF releases child A */
+	int pipefd2[2];		/* child B -> parent: verdict */
+	pid_t pid1, pid2;
+	int status1, status2;
+	__u64 netns_a_id = 0;
+	bool found_sibling_netns = false;
+	ssize_t nread;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(holdfd), 0);
+
+	/* Fork first child - creates user namespace A */
+	pid1 = fork();
+	ASSERT_GE(pid1, 0);
+
+	if (pid1 == 0) {
+		int fd;
+		__u64 id;
+		char buf;
+
+		close(pipefd[0]);
+		close(holdfd[1]);
+
+		/* Create user namespace A */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create network namespace owned by user namespace A */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get network namespace ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send namespace ID to parent */
+		if (write(pipefd[1], &id, sizeof(id)) != sizeof(id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/*
+		 * Keep alive for sibling to check: read() returns 0 (EOF)
+		 * once every write end of the hold pipe has been closed.
+		 */
+		if (read(holdfd[0], &buf, 1) < 0)
+			exit(1);
+		close(holdfd[0]);
+		exit(0);
+	}
+
+	/* Parent reads namespace A ID */
+	close(pipefd[1]);
+	close(holdfd[0]);
+	nread = read(pipefd[0], &netns_a_id, sizeof(netns_a_id));
+	close(pipefd[0]);
+
+	if (nread != sizeof(netns_a_id)) {
+		/* Child A failed before reporting; release and reap it */
+		close(holdfd[1]);
+		waitpid(pid1, NULL, 0);
+		ASSERT_TRUE(false);
+	}
+
+	TH_LOG("User namespace A created network namespace with ID %llu",
+	       (unsigned long long)netns_a_id);
+
+	/* Fork second child - creates user namespace B */
+	ASSERT_EQ(pipe(pipefd2), 0);
+
+	pid2 = fork();
+	ASSERT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNET,
+			.spare2 = 0,
+			.user_ns_id = 0,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		/* Drop the inherited hold-pipe write end so only the parent controls it */
+		close(holdfd[1]);
+		close(pipefd2[0]);
+
+		/* Create user namespace B (sibling to A) */
+		if (setup_userns() < 0) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+
+		/* Try to list all network namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+		if (ret < 0) {
+			/* A failed listing proves nothing about visibility */
+			close(pipefd2[1]);
+			exit(1);
+		}
+
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == netns_a_id) {
+				found_sibling_netns = true;
+				break;
+			}
+		}
+
+		/* We should NOT see the sibling's network namespace */
+		if (write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns)) !=
+		    sizeof(found_sibling_netns)) {
+			close(pipefd2[1]);
+			exit(1);
+		}
+		close(pipefd2[1]);
+		exit(0);
+	}
+
+	/* Parent reads result from second child */
+	close(pipefd2[1]);
+	nread = read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns));
+	close(pipefd2[0]);
+
+	/* Signal first child to exit */
+	close(holdfd[1]);
+
+	/* Reap both children before asserting on either of them */
+	waitpid(pid2, &status2, 0);
+	waitpid(pid1, &status1, 0);
+
+	ASSERT_TRUE(WIFEXITED(status2));
+	ASSERT_EQ(WEXITSTATUS(status2), 0);
+	ASSERT_TRUE(WIFEXITED(status1));
+	ASSERT_EQ(WEXITSTATUS(status1), 0);
+
+	/* The verdict must actually have been delivered */
+	ASSERT_TRUE(nread == sizeof(found_sibling_netns));
+
+	/* Second child should NOT have seen first child's namespace */
+	ASSERT_FALSE(found_sibling_netns);
+	TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace");
+}
+
+/*
+ * Test listing with LISTNS_CURRENT_USER from inside a new user namespace.
+ *
+ * The child calls setup_userns(), unshares net and uts namespaces and lists
+ * with user_ns_id = LISTNS_CURRENT_USER. At least three entries (user, net,
+ * uts) are expected.
+ */
+TEST(listns_current_user_permissions)
+{
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool success = false;
+	bool got_result;
+	ssize_t count = 0;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = 0,
+			.spare2 = 0,
+			.user_ns_id = LISTNS_CURRENT_USER,
+		};
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		close(pipefd[0]);
+
+		/* Create user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Create some namespaces owned by this user namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (unshare(CLONE_NEWUTS) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List with LISTNS_CURRENT_USER - should see our owned namespaces */
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+		/* A negative (error) return also yields success == false */
+		success = (ret >= 3); /* At least user, net, uts */
+
+		/* Report verdict and raw count; short writes fail the child */
+		if (write(pipefd[1], &success, sizeof(success)) != sizeof(success) ||
+		    write(pipefd[1], &ret, sizeof(ret)) != sizeof(ret)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	got_result = read(pipefd[0], &success, sizeof(success)) == sizeof(success) &&
+		     read(pipefd[0], &count, sizeof(count)) == sizeof(count);
+	close(pipefd[0]);
+
+	/* Reap the child before asserting on anything it sent */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(got_result);
+	ASSERT_TRUE(success);
+	TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count);
+}
+
+/*
+ * Test that a nested (child) user namespace shows up when listing the
+ * namespaces owned by its parent user namespace.
+ *
+ * The child process calls setup_userns() twice, recording the user
+ * namespace ID after each call, unshares a network namespace, and then
+ * lists with user_ns_id set to the first (parent) ID. The second (child)
+ * user namespace ID must be among the results.
+ */
+TEST(listns_parent_userns_cap_sys_admin)
+{
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool found_child_userns = false;
+	bool got_result;
+	ssize_t count = 0;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 parent_userns_id;
+		__u64 child_userns_id;
+		struct ns_id_req req;
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		close(pipefd[0]);
+
+		/* Create parent user namespace - we have CAP_SYS_ADMIN in it */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get parent user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Create child user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get child user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Create namespaces owned by child user namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* List namespaces owned by parent user namespace */
+		req.size = sizeof(req);
+		req.spare = 0;
+		req.ns_id = 0;
+		req.ns_type = 0;
+		req.spare2 = 0;
+		req.user_ns_id = parent_userns_id;
+
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+		/* Should see child user namespace in the list */
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == child_userns_id) {
+				found_child_userns = true;
+				break;
+			}
+		}
+
+		/* Report verdict and raw count; short writes fail the child */
+		if (write(pipefd[1], &found_child_userns, sizeof(found_child_userns)) !=
+		    sizeof(found_child_userns) ||
+		    write(pipefd[1], &ret, sizeof(ret)) != sizeof(ret)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	got_result = read(pipefd[0], &found_child_userns, sizeof(found_child_userns)) ==
+		     sizeof(found_child_userns) &&
+		     read(pipefd[0], &count, sizeof(count)) == sizeof(count);
+	close(pipefd[0]);
+
+	/* Reap the child before asserting on anything it sent */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(got_result);
+	ASSERT_TRUE(found_child_userns);
+	TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)",
+	       count);
+}
+
+/*
+ * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of.
+ * This is different from seeing namespaces owned by a user namespace.
+ *
+ * The child calls setup_userns(), records its user namespace ID and lists
+ * all user namespaces globally (user_ns_id = 0, ns_type = CLONE_NEWUSER).
+ * Its own user namespace ID must be among the results.
+ */
+TEST(listns_cap_sys_admin_inside_userns)
+{
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool found_ours = false;
+	ssize_t nread;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 our_userns_id;
+		struct ns_id_req req;
+		__u64 ns_ids[100];
+		ssize_t ret;
+
+		close(pipefd[0]);
+
+		/* Create user namespace - we have CAP_SYS_ADMIN inside it */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get our user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* List all user namespaces globally */
+		req.size = sizeof(req);
+		req.spare = 0;
+		req.ns_id = 0;
+		req.ns_type = CLONE_NEWUSER;
+		req.spare2 = 0;
+		req.user_ns_id = 0;
+
+		ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+		/* We should be able to see our own user namespace */
+		for (ssize_t i = 0; i < ret; i++) {
+			if (ns_ids[i] == our_userns_id) {
+				found_ours = true;
+				break;
+			}
+		}
+
+		/* Report verdict; a failed or short write fails the child */
+		if (write(pipefd[1], &found_ours, sizeof(found_ours)) != sizeof(found_ours)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	nread = read(pipefd[0], &found_ours, sizeof(found_ours));
+	close(pipefd[0]);
+
+	/* Reap the child before asserting on anything it sent */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(nread == sizeof(found_ours));
+	ASSERT_TRUE(found_ours);
+	TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of");
+}
+
+/*
+ * Test that dropping CAP_SYS_ADMIN restricts what we can see.
+ *
+ * Skipped unless the test starts with CAP_SYS_ADMIN in its effective set.
+ * The child calls setup_userns(), counts the network namespaces returned
+ * for LISTNS_CURRENT_USER, clears CAP_SYS_ADMIN from its effective and
+ * permitted sets, counts again, and reports whether the second count is
+ * less than or equal to the first.
+ */
+TEST(listns_drop_cap_sys_admin)
+{
+	cap_t caps;
+	cap_value_t cap_list[1] = { CAP_SYS_ADMIN };
+	cap_flag_value_t cap_val;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	bool correct = false;
+	bool got_result;
+	ssize_t count_before = 0, count_after = 0;
+
+	/* This test needs to start with CAP_SYS_ADMIN */
+	caps = cap_get_proc();
+	if (!caps) {
+		SKIP(return, "Cannot get capabilities");
+	}
+
+	if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) {
+		cap_free(caps);
+		SKIP(return, "Cannot check CAP_SYS_ADMIN");
+	}
+
+	if (cap_val != CAP_SET) {
+		cap_free(caps);
+		SKIP(return, "Test needs CAP_SYS_ADMIN to start");
+	}
+	cap_free(caps);
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		struct ns_id_req req = {
+			.size = sizeof(req),
+			.spare = 0,
+			.ns_id = 0,
+			.ns_type = CLONE_NEWNET,
+			.spare2 = 0,
+			.user_ns_id = LISTNS_CURRENT_USER,
+		};
+		__u64 ns_ids_before[100];
+		__u64 ns_ids_after[100];
+
+		close(pipefd[0]);
+
+		/* Create user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Count namespaces with CAP_SYS_ADMIN */
+		count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+		if (count_before < 0) {
+			/* Without a valid baseline the comparison below is meaningless */
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/*
+		 * Drop CAP_SYS_ADMIN. Every step is checked: if the
+		 * capability is not actually dropped, the second listing
+		 * would run with unchanged privileges and the test would
+		 * pass without testing anything.
+		 */
+		caps = cap_get_proc();
+		if (!caps) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR) < 0 ||
+		    cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR) < 0 ||
+		    cap_set_proc(caps) < 0) {
+			cap_free(caps);
+			close(pipefd[1]);
+			exit(1);
+		}
+		cap_free(caps);
+
+		/* Ensure we can't regain the capability */
+		prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
+		/* Count namespaces without CAP_SYS_ADMIN */
+		count_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+
+		/*
+		 * Without CAP_SYS_ADMIN, we should see same or fewer namespaces.
+		 * NOTE(review): a failing second call (-1) also satisfies this
+		 * comparison - confirm whether that should count as an error.
+		 */
+		correct = (count_after <= count_before);
+
+		/* Report verdict and both counts; short writes fail the child */
+		if (write(pipefd[1], &correct, sizeof(correct)) != sizeof(correct) ||
+		    write(pipefd[1], &count_before, sizeof(count_before)) != sizeof(count_before) ||
+		    write(pipefd[1], &count_after, sizeof(count_after)) != sizeof(count_after)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+
+	got_result = read(pipefd[0], &correct, sizeof(correct)) == sizeof(correct) &&
+		     read(pipefd[0], &count_before, sizeof(count_before)) == sizeof(count_before) &&
+		     read(pipefd[0], &count_after, sizeof(count_after)) == sizeof(count_after);
+	close(pipefd[0]);
+
+	/* Reap the child before asserting on anything it sent */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_TRUE(got_result);
+	ASSERT_TRUE(correct);
+	TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces",
+	       count_before, count_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/listns_test.c b/tools/testing/selftests/namespaces/listns_test.c
new file mode 100644
index 000000000000..8a95789d6a87
--- /dev/null
+++ b/tools/testing/selftests/namespaces/listns_test.c
@@ -0,0 +1,679 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Basic listns() smoke test on the unified namespace tree: a global,
+ * unfiltered listing must return at least one entry and no entry may
+ * carry the ID 0.
+ */
+TEST(listns_basic_unified)
+{
+	/* Only .size is non-zero: all namespace types, global listing */
+	struct ns_id_req req = { .size = sizeof(req) };
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t idx;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+
+	/* The initial namespaces are expected to be present */
+	ASSERT_GT(ret, 0);
+	TH_LOG("Found %zd active namespaces", ret);
+
+	/* Walk the result and make sure every ID is non-zero */
+	idx = 0;
+	while (idx < ret) {
+		ASSERT_NE(ns_ids[idx], 0);
+		TH_LOG(" [%zd] ns_id: %llu", idx, (unsigned long long)ns_ids[idx]);
+		idx++;
+	}
+}
+
+/*
+ * listns() with a type filter: request only network namespaces, expect at
+ * least one, and for up to the first five results try to open the namespace
+ * by file handle and confirm via NS_GET_NSTYPE that it is a network
+ * namespace.
+ *
+ * The per-entry verification is best effort: it only runs when
+ * open_by_handle_at() succeeds and the ioctl returns a non-negative type.
+ */
+TEST(listns_filter_by_type)
+{
+	/* Only network namespaces; every other request field stays zero */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.ns_type = CLONE_NEWNET,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t limit;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	/* Should find at least init_net */
+	ASSERT_GT(ret, 0);
+	TH_LOG("Found %zd active network namespaces", ret);
+
+	/* Inspect at most the first five entries */
+	limit = ret < 5 ? ret : 5;
+
+	for (ssize_t idx = 0; idx < limit; idx++) {
+		struct nsfs_file_handle nsfh = {
+			.ns_id = ns_ids[idx],
+			.ns_type = CLONE_NEWNET,
+			.ns_inum = 0,
+		};
+		struct file_handle *fh;
+		int nsfd;
+		int ns_type;
+
+		/* Build a file handle whose payload is the nsfs handle */
+		fh = malloc(sizeof(*fh) + sizeof(nsfh));
+		ASSERT_NE(fh, NULL);
+		fh->handle_bytes = sizeof(nsfh);
+		/*
+		 * NOTE(review): handle_type is 0 here - confirm that this is
+		 * the type the nsfs handle decoder expects, otherwise the
+		 * open below fails and the check is skipped for every entry.
+		 */
+		fh->handle_type = 0;
+		memcpy(fh->f_handle, &nsfh, sizeof(nsfh));
+
+		nsfd = open_by_handle_at(-10003, fh, O_RDONLY);
+		free(fh);
+
+		/* Could not open this one: nothing to verify */
+		if (nsfd < 0)
+			continue;
+
+		/* Verify it's a network namespace via ioctl */
+		ns_type = ioctl(nsfd, NS_GET_NSTYPE);
+		if (ns_type >= 0) {
+			ASSERT_EQ(ns_type, CLONE_NEWNET);
+		}
+		close(nsfd);
+	}
+}
+
+/*
+ * listns() pagination: fetch a first batch of two IDs and, if the batch was
+ * filled completely, continue from its last ID. When the second batch is
+ * non-empty its first ID must be greater than the last ID of the first
+ * batch.
+ */
+TEST(listns_pagination)
+{
+	/* Unfiltered global listing, starting from the beginning */
+	struct ns_id_req req = { .size = sizeof(req) };
+	__u64 batch1[2], batch2[2];
+	ssize_t ret1, ret2;
+
+	/* Get first batch */
+	ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0);
+	if (ret1 < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret1 < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret1, 0);
+
+	if (ret1 == 0)
+		SKIP(return, "No namespaces found");
+
+	TH_LOG("First batch: %zd namespaces", ret1);
+
+	/* A partially filled first batch means there is nothing left to page */
+	if (ret1 != ARRAY_SIZE(batch1)) {
+		TH_LOG("All namespaces fit in first batch");
+		return;
+	}
+
+	/* Resume the listing after the last ID we already have */
+	req.ns_id = batch1[ret1 - 1];
+	ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0);
+	ASSERT_GE(ret2, 0);
+
+	TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)",
+	       ret2, (unsigned long long)req.ns_id);
+
+	/* Nothing more returned: no ordering to verify */
+	if (ret2 <= 0)
+		return;
+
+	/* The continuation must start strictly after the previous batch */
+	ASSERT_GT(batch2[0], batch1[ret1 - 1]);
+	TH_LOG("Pagination working: %llu > %llu",
+	       (unsigned long long)batch2[0],
+	       (unsigned long long)batch1[ret1 - 1]);
+}
+
+/*
+ * listns() with LISTNS_CURRENT_USER: list the namespaces owned by the
+ * caller's user namespace and log every returned ID. Only a non-negative
+ * return value is required.
+ */
+TEST(listns_current_user)
+{
+	/* All types, restricted to the current user namespace */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.user_ns_id = LISTNS_CURRENT_USER,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t idx;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	/* Should find at least the initial namespaces if we're in init_user_ns */
+	TH_LOG("Found %zd namespaces owned by current user namespace", ret);
+
+	/* Dump every ID for the log */
+	idx = 0;
+	while (idx < ret) {
+		TH_LOG(" [%zd] ns_id: %llu", idx, (unsigned long long)ns_ids[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Test that listns() only returns active namespaces.
+ *
+ * A child creates a network namespace, reports its ID and exits shortly
+ * afterwards. The parent lists network namespaces before the fork, once
+ * after receiving the ID, and once more after the child has been reaped.
+ * The new ID must no longer appear in the final listing.
+ */
+TEST(listns_only_active)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[100], ns_ids_after[100];
+	ssize_t ret_before, ret_after;
+	ssize_t bytes;
+	int pipefd[2];
+	pid_t pid;
+	__u64 new_ns_id = 0;
+	int status;
+
+	/* Get initial list */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret_before, 0);
+
+	TH_LOG("Before: %zd active network namespaces", ret_before);
+
+	/* Create a new namespace in a child process and get its ID */
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 ns_id;
+
+		close(pipefd[0]);
+
+		/* Create new network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get its ID */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/* Send ID to parent; a failed or short write fails the child */
+		if (write(pipefd[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Keep namespace active briefly */
+		usleep(100000);
+		exit(0);
+	}
+
+	/* Parent reads the new namespace ID */
+	close(pipefd[1]);
+	bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id));
+	close(pipefd[0]);
+
+	if (bytes == sizeof(new_ns_id)) {
+		__u64 ns_ids_during[100];
+		ssize_t ret_during;
+
+		TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id);
+
+		/* List namespaces while the child is expected to still be alive */
+		ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+		ASSERT_GE(ret_during, 0);
+		TH_LOG("During: %zd active network namespaces", ret_during);
+
+		/* Should have at least as many namespaces as before */
+		ASSERT_GE(ret_during, ret_before);
+	} else {
+		/* Discard a partially received ID so it is not searched for below */
+		new_ns_id = 0;
+	}
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+
+	/* Give time for namespace to become inactive */
+	usleep(100000);
+
+	/* List namespaces after child exits - should not see new one */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+	TH_LOG("After: %zd active network namespaces", ret_after);
+
+	/* Verify the new namespace ID is not in the after list */
+	if (new_ns_id != 0) {
+		bool found = false;
+
+		for (ssize_t i = 0; i < ret_after; i++) {
+			if (ns_ids_after[i] == new_ns_id) {
+				found = true;
+				break;
+			}
+		}
+		ASSERT_FALSE(found);
+	}
+}
+
+/*
+ * Test listns() with specific user namespace ID.
+ *
+ * A child calls setup_userns(), creates net and uts namespaces, reports
+ * its user namespace ID over a socketpair and then waits. The parent lists
+ * the namespaces owned by that ID, logs the first few results and finally
+ * tells the child to exit.
+ */
+TEST(listns_specific_userns)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0, /* Will be filled with created userns ID */
+	};
+	__u64 ns_ids[100];
+	int sv[2];
+	pid_t pid;
+	int status;
+	__u64 user_ns_id = 0;
+	ssize_t bytes;
+	ssize_t ret;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		__u64 ns_id;
+		char buf;
+
+		close(sv[0]);
+
+		/* Create new user namespace */
+		if (setup_userns() < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Get user namespace ID */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		if (ioctl(fd, NS_GET_ID, &ns_id) < 0) {
+			close(fd);
+			close(sv[1]);
+			exit(1);
+		}
+		close(fd);
+
+		/*
+		 * Create some namespaces owned by this user namespace
+		 * (best effort). This happens before the ID is sent so
+		 * they already exist when the parent performs its listing.
+		 */
+		unshare(CLONE_NEWNET);
+		unshare(CLONE_NEWUTS);
+
+		/* Send ID to parent */
+		if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) {
+			close(sv[1]);
+			exit(1);
+		}
+
+		/* Wait for parent signal */
+		if (read(sv[1], &buf, 1) != 1) {
+			close(sv[1]);
+			exit(1);
+		}
+		close(sv[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(sv[1]);
+	bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id));
+
+	if (bytes != sizeof(user_ns_id)) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to get user namespace ID from child");
+	}
+
+	TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id);
+
+	/* List namespaces owned by this user namespace */
+	req.user_ns_id = user_ns_id;
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+	if (ret < 0) {
+		/*
+		 * Capture errno immediately: the logging and cleanup calls
+		 * below may overwrite it before the ENOSYS check.
+		 */
+		int saved_errno = errno;
+
+		TH_LOG("listns failed: %s (errno=%d)", strerror(saved_errno), saved_errno);
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		if (saved_errno == ENOSYS) {
+			SKIP(return, "listns() not supported");
+		}
+		ASSERT_GE(ret, 0);
+	}
+
+	TH_LOG("Found %zd namespaces owned by user namespace %llu", ret,
+	       (unsigned long long)user_ns_id);
+
+	/* Log up to the first ten IDs; the result is not asserted on */
+	if (ret > 0) {
+		for (ssize_t i = 0; i < ret && i < 10; i++)
+			TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]);
+	}
+
+	/* Signal child to exit */
+	if (write(sv[0], "X", 1) != 1) {
+		close(sv[0]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		ASSERT_TRUE(false);
+	}
+	close(sv[0]);
+	waitpid(pid, &status, 0);
+}
+
+/*
+ * listns() with a combined type filter (network | UTS): the call must
+ * succeed and every returned ID is logged.
+ */
+TEST(listns_multiple_types)
+{
+	/* Network and UTS namespaces, global listing */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.ns_type = CLONE_NEWNET | CLONE_NEWUTS,
+	};
+	__u64 ns_ids[100];
+	ssize_t ret;
+	ssize_t idx;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	if (ret < 0) {
+		TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno);
+		ASSERT_TRUE(false);
+	}
+	ASSERT_GE(ret, 0);
+
+	TH_LOG("Found %zd active network/UTS namespaces", ret);
+
+	/* Dump every ID for the log */
+	idx = 0;
+	while (idx < ret) {
+		TH_LOG(" [%zd] ns_id: %llu", idx, (unsigned long long)ns_ids[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Test that hierarchical active reference propagation keeps parent
+ * user namespaces visible in listns().
+ *
+ * A child process calls setup_userns() twice, recording the user namespace
+ * ID after each call, sends both IDs to the parent over a socketpair and
+ * then blocks until the parent writes one byte back. While the child is
+ * blocked, the parent lists all user namespaces and requires both IDs to
+ * be present.
+ */
+TEST(listns_hierarchical_visibility)
+{
+ struct ns_id_req req = {
+ .size = sizeof(req),
+ .spare = 0,
+ .ns_id = 0,
+ .ns_type = CLONE_NEWUSER,
+ .spare2 = 0,
+ .user_ns_id = 0,
+ };
+ __u64 parent_ns_id = 0, child_ns_id = 0;
+ int sv[2];
+ pid_t pid;
+ int status;
+ int bytes;
+ __u64 ns_ids[100];
+ ssize_t ret;
+ bool found_parent, found_child;
+
+ /* sv[0] is the parent's end, sv[1] the child's end */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int fd;
+ char buf;
+
+ close(sv[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Record the ID of the user namespace we are in now */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Create child user namespace */
+ if (setup_userns() < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Record the ID of the nested user namespace */
+ fd = open("/proc/self/ns/user", O_RDONLY);
+ if (fd < 0) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) {
+ close(fd);
+ close(sv[1]);
+ exit(1);
+ }
+ close(fd);
+
+ /* Send both IDs to parent */
+ if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+ if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) {
+ close(sv[1]);
+ exit(1);
+ }
+
+ /* Wait for parent signal */
+ if (read(sv[1], &buf, 1) != 1) {
+ close(sv[1]);
+ exit(1);
+ }
+ close(sv[1]);
+ exit(0);
+ }
+
+ /* Parent */
+ close(sv[1]);
+
+ /* Read both namespace IDs */
+ /* The two read() results are summed and checked as one total below */
+ bytes = read(sv[0], &parent_ns_id, sizeof(parent_ns_id));
+ bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id));
+
+ /* Anything other than two full IDs: kill and reap the child, then skip */
+ if (bytes != (int)(2 * sizeof(__u64))) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to get namespace IDs from child");
+ }
+
+ TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id);
+ TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id);
+
+ /* List all user namespaces */
+ ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+
+ /* errno is checked right after the call, before any cleanup runs */
+ if (ret < 0 && errno == ENOSYS) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "listns() not supported");
+ }
+
+ ASSERT_GE(ret, 0);
+ TH_LOG("Found %zd active user namespaces", ret);
+
+ /* Both parent and child should be visible (active due to child process) */
+ found_parent = false;
+ found_child = false;
+ for (ssize_t i = 0; i < ret; i++) {
+ if (ns_ids[i] == parent_ns_id)
+ found_parent = true;
+ if (ns_ids[i] == child_ns_id)
+ found_child = true;
+ }
+
+ TH_LOG("Parent namespace %s, child namespace %s",
+ found_parent ? "found" : "NOT FOUND",
+ found_child ? "found" : "NOT FOUND");
+
+ ASSERT_TRUE(found_child);
+ /* With hierarchical propagation, parent should also be active */
+ ASSERT_TRUE(found_parent);
+
+ /* Signal child to exit */
+ if (write(sv[0], "X", 1) != 1) {
+ close(sv[0]);
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ ASSERT_TRUE(false);
+ }
+ close(sv[0]);
+ /* Reap the child; its exit status is not examined */
+ waitpid(pid, &status, 0);
+}
+
+/*
+ * Test error cases for listns().
+ *
+ * Each call below passes one invalid argument and must fail. The first
+ * call doubles as a probe: if it fails with ENOSYS the syscall is not
+ * available and the whole test is skipped.
+ */
+TEST(listns_error_cases)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[10];
+	ssize_t ret;
+
+	/*
+	 * Test with invalid flags. errno is only meaningful when the call
+	 * failed, so ENOSYS is checked together with the return value rather
+	 * than on its own (a stale errno must not disable the checks).
+	 */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "listns() not supported");
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EINVAL);
+
+	/* Test with NULL ns_ids array */
+	ret = sys_listns(&req, NULL, 10, 0);
+	ASSERT_LT(ret, 0);
+
+	/* Test with invalid spare field */
+	req.spare = 1;
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EINVAL);
+	req.spare = 0;
+
+	/* Test with huge nr_ns_ids */
+	ret = sys_listns(&req, ns_ids, 2000000, 0);
+	ASSERT_LT(ret, 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/ns_active_ref_test.c b/tools/testing/selftests/namespaces/ns_active_ref_test.c
new file mode 100644
index 000000000000..093268f0efaa
--- /dev/null
+++ b/tools/testing/selftests/namespaces/ns_active_ref_test.c
@@ -0,0 +1,2672 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/nsfs.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Fallback definition for when the included headers do not provide it.
+ * The tests below pass it as the directory fd to open_by_handle_at().
+ */
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */
+#endif
+
+/*
+ * Fallback definition for when the included headers do not provide it.
+ * The tests below store it in file_handle.handle_type when they build an
+ * nsfs file handle by hand from a namespace ID.
+ */
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * The initial network namespace must be reachable through a file handle.
+ * Initial namespaces should have active ref count of 1 from boot, so after
+ * encoding a handle for /proc/1/ns/net and dropping the fd, reopening the
+ * handle is expected to succeed and to yield the same inode.
+ */
+TEST(init_ns_always_active)
+{
+	struct stat st_proc, st_reopened;
+	struct file_handle *fh;
+	int proc_fd, reopened_fd;
+	int mnt_id;
+	int rc;
+
+	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
+	ASSERT_NE(fh, NULL);
+
+	/* Grab the initial network namespace and encode a handle for it */
+	proc_fd = open("/proc/1/ns/net", O_RDONLY);
+	ASSERT_GE(proc_fd, 0);
+
+	fh->handle_bytes = MAX_HANDLE_SZ;
+	rc = name_to_handle_at(proc_fd, "", fh, &mnt_id, AT_EMPTY_PATH);
+	if (rc < 0) {
+		if (errno == EOPNOTSUPP) {
+			SKIP(free(fh); close(proc_fd);
+			     return, "nsfs doesn't support file handles");
+		}
+	}
+	ASSERT_EQ(rc, 0);
+
+	/* Drop the fd so only the handle refers to the namespace */
+	close(proc_fd);
+
+	/* The init namespace is always active, so the handle must resolve */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
+	if (reopened_fd < 0) {
+		switch (errno) {
+		case EINVAL:
+		case EOPNOTSUPP:
+			SKIP(free(fh);
+			     return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		default:
+			break;
+		}
+	}
+	ASSERT_GE(reopened_fd, 0);
+
+	/* Both descriptors have to resolve to the same inode */
+	proc_fd = open("/proc/1/ns/net", O_RDONLY);
+	ASSERT_GE(proc_fd, 0);
+	ASSERT_EQ(fstat(proc_fd, &st_proc), 0);
+	ASSERT_EQ(fstat(reopened_fd, &st_reopened), 0);
+	ASSERT_EQ(st_proc.st_ino, st_reopened.st_ino);
+
+	free(fh);
+	close(reopened_fd);
+	close(proc_fd);
+}
+
+/*
+ * Test namespace lifecycle: create a namespace in a child process,
+ * get a file handle while it's active, then try to reopen after
+ * the process exits (namespace becomes inactive).
+ *
+ * The child reports failure (exit status 1) if it cannot hand the complete
+ * handle over, so the parent never works with a truncated handle.
+ */
+TEST(ns_inactive_after_exit)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int fd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	ssize_t len;
+
+	/* Create pipe for passing file handle from child */
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Open our new namespace */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Get file handle for the namespace */
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(fd);
+
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Send handle to parent; a short or failed write is an error */
+		len = sizeof(*handle) + handle->handle_bytes;
+		if (write(pipefd[1], buf, len) != len) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Exit - namespace should become inactive */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	/* Read file handle from child */
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	/* Try to reopen namespace - should fail with ENOENT since it's inactive */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(fd, 0);
+	/* Should fail with ENOENT (namespace inactive) or ESTALE */
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a process is using it,
+ * even after the creating process exits.
+ *
+ * Ordering is enforced with pipes rather than sleeps:
+ *  - the first child creates the namespace, sends its handle and then
+ *    blocks on syncpipe until the parent lets it exit;
+ *  - the second child joins the namespace, reports that on readypipe and
+ *    then blocks on holdpipe until the parent has finished its check.
+ */
+TEST(ns_active_with_multiple_processes)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int fd;
+	int pipefd[2];
+	int syncpipe[2];
+	int readypipe[2];
+	int holdpipe[2];
+	pid_t pid1, pid2;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+	ssize_t len;
+
+	/* Create pipes for communication */
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	pid1 = fork();
+	ASSERT_GE(pid1, 0);
+
+	if (pid1 == 0) {
+		/* First child - creates namespace */
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		/* Open and get handle */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(fd);
+
+		if (ret < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		/* Send handle to parent; a short or failed write is an error */
+		len = sizeof(*handle) + handle->handle_bytes;
+		if (write(pipefd[1], buf, len) != len) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		/* Wait for signal (a byte or EOF) before exiting */
+		if (read(syncpipe[0], &sync_byte, 1) < 0) {
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	/* Parent reads handle */
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+	ASSERT_GT(ret, 0);
+
+	handle = (struct file_handle *)buf;
+
+	/*
+	 * Created only after the first child was forked, so the first child
+	 * never holds an end of these pipes.
+	 */
+	ASSERT_EQ(pipe(readypipe), 0);
+	ASSERT_EQ(pipe(holdpipe), 0);
+
+	/* Create second child that will keep namespace active */
+	pid2 = fork();
+	ASSERT_GE(pid2, 0);
+
+	if (pid2 == 0) {
+		/* Second child - reopens the namespace */
+		close(syncpipe[0]);
+		close(syncpipe[1]);
+		close(readypipe[0]);
+		close(holdpipe[1]);
+
+		/* Open the namespace via handle */
+		fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+		if (fd < 0)
+			exit(1);
+
+		/* Join the namespace */
+		ret = setns(fd, CLONE_NEWNET);
+		close(fd);
+		if (ret < 0)
+			exit(1);
+
+		/* Tell the parent that we are now inside the namespace */
+		sync_byte = 'R';
+		if (write(readypipe[1], &sync_byte, 1) != 1)
+			exit(1);
+		close(readypipe[1]);
+
+		/* Keep the namespace active until the parent releases us */
+		if (read(holdpipe[0], &sync_byte, 1) < 0)
+			exit(1);
+		close(holdpipe[0]);
+		exit(0);
+	}
+
+	close(readypipe[1]);
+	close(holdpipe[0]);
+
+	/*
+	 * Wait until the second child has joined the namespace. EOF here
+	 * means the child exited before it got that far.
+	 */
+	ret = read(readypipe[0], &sync_byte, 1);
+	close(readypipe[0]);
+	ASSERT_EQ(ret, 1);
+
+	/* Signal first child to exit */
+	close(syncpipe[0]);
+	sync_byte = 'X';
+	ret = write(syncpipe[1], &sync_byte, 1);
+	close(syncpipe[1]);
+	ASSERT_EQ(ret, 1);
+
+	/* Wait for first child */
+	waitpid(pid1, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+
+	/* Namespace should still be active because second child is using it */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(fd, 0);
+	close(fd);
+
+	/* Release the second child (it sees EOF) and collect it */
+	close(holdpipe[1]);
+	waitpid(pid2, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+/*
+ * Test user namespace active ref tracking via credential lifecycle.
+ *
+ * A child creates a user namespace, sends a file handle for it to the
+ * parent and exits. Once the child is gone the handle must no longer be
+ * openable.
+ */
+TEST(userns_active_ref_lifecycle)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int fd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	ssize_t len;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Child process. The IDs to map must be read before
+		 * unshare(): afterwards getuid()/getgid() report the
+		 * unmapped overflow ID instead of the real one.
+		 */
+		uid_t outer_uid = getuid();
+		gid_t outer_gid = getgid();
+		int uid_map_fd, gid_map_fd, setgroups_fd;
+		char mapping[64];
+
+		close(pipefd[0]);
+
+		/* Create new user namespace */
+		ret = unshare(CLONE_NEWUSER);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/*
+		 * Set up uid/gid mappings. This is best effort: the handle
+		 * check below does not depend on the mappings, so failures
+		 * are deliberately ignored.
+		 */
+		uid_map_fd = open("/proc/self/uid_map", O_WRONLY);
+		gid_map_fd = open("/proc/self/gid_map", O_WRONLY);
+		setgroups_fd = open("/proc/self/setgroups", O_WRONLY);
+
+		if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) {
+			write(setgroups_fd, "deny", 4);
+
+			snprintf(mapping, sizeof(mapping), "0 %u 1",
+				 (unsigned int)outer_uid);
+			write(uid_map_fd, mapping, strlen(mapping));
+
+			snprintf(mapping, sizeof(mapping), "0 %u 1",
+				 (unsigned int)outer_gid);
+			write(gid_map_fd, mapping, strlen(mapping));
+		}
+
+		/* Close whatever was opened, even if only some opens worked */
+		if (setgroups_fd >= 0)
+			close(setgroups_fd);
+		if (uid_map_fd >= 0)
+			close(uid_map_fd);
+		if (gid_map_fd >= 0)
+			close(gid_map_fd);
+
+		/* Get file handle */
+		fd = open("/proc/self/ns/user", O_RDONLY);
+		if (fd < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(fd);
+
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Send handle to parent; a short or failed write is an error */
+		len = sizeof(*handle) + handle->handle_bytes;
+		if (write(pipefd[1], buf, len) != len) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	/* Namespace should be inactive after all tasks exit */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(fd, 0);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test PID namespace active ref tracking.
+ *
+ * The child unshares a PID namespace and forks a grandchild, which is the
+ * first process inside it. The grandchild sends a file handle for the
+ * namespace to the test process. After both have exited the handle must no
+ * longer be openable.
+ */
+TEST(pidns_active_ref_lifecycle)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int fd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	ssize_t len;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		/* Create new PID namespace */
+		ret = unshare(CLONE_NEWPID);
+		if (ret < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		/* Fork to actually enter the PID namespace */
+		pid_t child = fork();
+		if (child < 0) {
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		if (child == 0) {
+			/* Grandchild - in new PID namespace */
+			fd = open("/proc/self/ns/pid", O_RDONLY);
+			if (fd < 0) {
+				close(pipefd[1]);
+				exit(1);
+			}
+
+			handle = (struct file_handle *)buf;
+			handle->handle_bytes = MAX_HANDLE_SZ;
+			ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+			close(fd);
+
+			if (ret < 0) {
+				close(pipefd[1]);
+				exit(1);
+			}
+
+			/* Send handle to grandparent; a short write is an error */
+			len = sizeof(*handle) + handle->handle_bytes;
+			if (write(pipefd[1], buf, len) != len) {
+				close(pipefd[1]);
+				exit(1);
+			}
+			close(pipefd[1]);
+			exit(0);
+		}
+
+		/* Only the grandchild writes to the pipe */
+		close(pipefd[1]);
+
+		/*
+		 * Wait for grandchild and pass its result on, so that a
+		 * failure inside the PID namespace is visible in our own
+		 * exit status instead of being reported as success.
+		 */
+		if (waitpid(child, &status, 0) != child)
+			exit(1);
+		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+			exit(1);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	/* Namespace should be inactive after all processes exit */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(fd, 0);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that an open file descriptor keeps a namespace active.
+ * Even after the creating process exits, the namespace should remain
+ * active as long as an fd is held open.
+ *
+ * Sequence: the child creates a network namespace and sends its file
+ * handle; the parent opens /proc/<pid>/ns/net and then lets the child
+ * exit. With the fd held, reopening by handle must succeed; after the fd
+ * is closed it must fail.
+ */
+TEST(ns_fd_keeps_active)
+{
+	struct file_handle *handle;
+	struct stat st1, st2;
+	int mount_id;
+	int ret;
+	int nsfd;
+	int fd2;
+	int pipe_child_ready[2];
+	int pipe_parent_ready[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+	char proc_path[64];
+	ssize_t len;
+
+	ASSERT_EQ(pipe(pipe_child_ready), 0);
+	ASSERT_EQ(pipe(pipe_parent_ready), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipe_child_ready[0]);
+		close(pipe_parent_ready[1]);
+
+		TH_LOG("Child: creating new network namespace");
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: network namespace created successfully");
+
+		/* Get file handle for the namespace */
+		nsfd = open("/proc/self/ns/net", O_RDONLY);
+		if (nsfd < 0) {
+			TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened namespace fd %d", nsfd);
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(nsfd);
+
+		if (ret < 0) {
+			TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno));
+			close(pipe_child_ready[1]);
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes);
+
+		/* Send file handle to parent */
+		len = sizeof(*handle) + handle->handle_bytes;
+		ret = write(pipe_child_ready[1], buf, len);
+		TH_LOG("Child: sent %d bytes of file handle to parent", ret);
+		close(pipe_child_ready[1]);
+		if (ret != len) {
+			/* The parent would otherwise use a truncated handle */
+			close(pipe_parent_ready[0]);
+			exit(1);
+		}
+
+		/* Wait for parent to open the fd */
+		TH_LOG("Child: waiting for parent to open fd");
+		ret = read(pipe_parent_ready[0], &sync_byte, 1);
+		close(pipe_parent_ready[0]);
+
+		TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret);
+		/* Exit - namespace should stay active because parent holds fd */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipe_child_ready[1]);
+	close(pipe_parent_ready[0]);
+
+	TH_LOG("Parent: reading file handle from child");
+
+	/* Read file handle from child */
+	ret = read(pipe_child_ready[0], buf, sizeof(buf));
+	close(pipe_child_ready[0]);
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes);
+
+	/* Open the child's namespace while it's still alive */
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid);
+	TH_LOG("Parent: opening child's namespace at %s", proc_path);
+	nsfd = open(proc_path, O_RDONLY);
+	if (nsfd < 0) {
+		TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno));
+		close(pipe_parent_ready[1]);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child's namespace");
+	}
+
+	TH_LOG("Parent: opened child's namespace, got fd %d", nsfd);
+
+	/* Signal child that we have the fd */
+	sync_byte = 'G';
+	ret = write(pipe_parent_ready[1], &sync_byte, 1);
+	close(pipe_parent_ready[1]);
+	ASSERT_EQ(ret, 1);
+	TH_LOG("Parent: signaled child that we have the fd");
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	TH_LOG("Child exited, parent holds fd %d to namespace", nsfd);
+
+	/*
+	 * Namespace should still be ACTIVE because we hold an fd.
+	 * We should be able to reopen it via file handle.
+	 */
+	TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)");
+	fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(fd2, 0);
+
+	TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2);
+
+	/* Verify it's the same namespace */
+	ASSERT_EQ(fstat(nsfd, &st1), 0);
+	ASSERT_EQ(fstat(fd2, &st2), 0);
+	/* ino_t is not unsigned long on every ABI, so widen it explicitly */
+	TH_LOG("Namespace inodes: nsfd=%llu, fd2=%llu",
+	       (unsigned long long)st1.st_ino, (unsigned long long)st2.st_ino);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	close(fd2);
+
+	/* Now close the fd - namespace should become inactive */
+	TH_LOG("Closing fd %d - namespace should become inactive", nsfd);
+	close(nsfd);
+
+	/* Now reopening should fail - namespace is inactive */
+	TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)");
+	fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(fd2, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test hierarchical active reference propagation.
+ * When a child namespace is active, its owning user namespace should also
+ * be active automatically due to hierarchical active reference propagation.
+ * This ensures parents are always reachable when children are active.
+ *
+ * The child process nests two user namespaces and reports both namespace
+ * IDs. The test process builds nsfs file handles from those IDs by hand and
+ * probes which of them can be opened at each stage.
+ */
+TEST(ns_parent_always_reachable)
+{
+	struct file_handle *parent_handle, *child_handle;
+	int ret;
+	int child_nsfd;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	__u64 parent_id, child_id;
+	char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		TH_LOG("Child: creating parent user namespace and setting up mappings");
+
+		/* Create parent user namespace with mappings */
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for parent user namespace */
+		int parent_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (parent_fd < 0) {
+			TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened parent userns fd %d", parent_fd);
+
+		if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno));
+			close(parent_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(parent_fd);
+
+		TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id);
+
+		/* Create child user namespace within parent */
+		TH_LOG("Child: creating nested child user namespace");
+		ret = setup_userns();
+		if (ret < 0) {
+			TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", getuid(), getgid());
+
+		/* Get namespace ID for child user namespace */
+		int child_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (child_fd < 0) {
+			TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno));
+			close(pipefd[1]);
+			exit(1);
+		}
+
+		TH_LOG("Child: opened child userns fd %d", child_fd);
+
+		if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) {
+			TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno));
+			close(child_fd);
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(child_fd);
+
+		TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id);
+
+		/* Send both namespace IDs to parent; report a failed write */
+		TH_LOG("Child: sending both namespace IDs to parent");
+		if (write(pipefd[1], &parent_id, sizeof(parent_id)) != (ssize_t)sizeof(parent_id) ||
+		    write(pipefd[1], &child_id, sizeof(child_id)) != (ssize_t)sizeof(child_id)) {
+			close(pipefd[1]);
+			exit(1);
+		}
+		close(pipefd[1]);
+
+		TH_LOG("Child: exiting - parent userns should become inactive");
+		/* Exit - parent user namespace should become inactive */
+		exit(0);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+
+	TH_LOG("Parent: reading both namespace IDs from child");
+
+	/* Read both namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &parent_id, sizeof(parent_id));
+	if (ret != sizeof(parent_id)) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &child_id, sizeof(child_id));
+	close(pipefd[0]);
+	if (ret != sizeof(child_id)) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read child namespace ID from child");
+	}
+
+	TH_LOG("Parent: received parent_id=%llu, child_id=%llu",
+	       (unsigned long long)parent_id, (unsigned long long)child_id);
+
+	/* Construct file handles from namespace IDs */
+	parent_handle = (struct file_handle *)parent_buf;
+	parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	parent_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+	parent_fh->ns_id = parent_id;
+	parent_fh->ns_type = 0;
+	parent_fh->ns_inum = 0;
+
+	child_handle = (struct file_handle *)child_buf;
+	child_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	child_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle;
+	child_fh->ns_id = child_id;
+	child_fh->ns_type = 0;
+	child_fh->ns_inum = 0;
+
+	TH_LOG("Parent: opening child namespace BEFORE child exits");
+
+	/* Open child namespace while child is still alive to keep it active */
+	child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY);
+	if (child_nsfd < 0) {
+		TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child namespace");
+	}
+
+	TH_LOG("Opened child namespace fd %d", child_nsfd);
+
+	/* Now wait for child to exit */
+	TH_LOG("Parent: waiting for child to exit");
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	TH_LOG("Child process exited, parent holds fd to child namespace");
+
+	/*
+	 * With hierarchical active reference propagation:
+	 * Since the child namespace is active (parent process holds fd),
+	 * the parent user namespace should ALSO be active automatically.
+	 * This is because when we took an active reference on the child,
+	 * it propagated up to the owning user namespace.
+	 */
+	TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)");
+	int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd, 0);
+
+	TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd);
+
+	/* Verify we can also get parent via NS_GET_USERNS */
+	TH_LOG("Verifying NS_GET_USERNS also works");
+	int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS);
+	if (parent_fd2 < 0) {
+		close(parent_fd);
+		close(child_nsfd);
+		TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno);
+		SKIP(return, "NS_GET_USERNS not supported or failed");
+	}
+
+	TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2);
+
+	/* Verify both methods give us the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(parent_fd, &st1), 0);
+	ASSERT_EQ(fstat(parent_fd2, &st2), 0);
+	/* ino_t is not unsigned long on every ABI, so widen it explicitly */
+	TH_LOG("Parent namespace inodes: parent_fd=%llu, parent_fd2=%llu",
+	       (unsigned long long)st1.st_ino, (unsigned long long)st2.st_ino);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	/*
+	 * Close child fd - parent should remain active because we still
+	 * hold direct references to it (parent_fd and parent_fd2).
+	 */
+	TH_LOG("Closing child fd - parent should remain active (direct refs held)");
+	close(child_nsfd);
+
+	/* Parent should still be openable */
+	TH_LOG("Verifying parent still active via file handle");
+	int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(parent_fd3, 0);
+	close(parent_fd3);
+
+	TH_LOG("Closing all fds to parent namespace");
+	close(parent_fd);
+	close(parent_fd2);
+
+	/* Both should now be inactive */
+	TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)");
+	parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_LT(parent_fd, 0);
+	TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+}
+
+/*
+ * Test that bind mounts keep namespaces in the tree even when inactive.
+ *
+ * The child bind-mounts its new network namespace onto a temporary file
+ * inside a private mount namespace and sends the namespace's file handle to
+ * the test process. The temporary file is removed on every exit path,
+ * including the ones where an assertion aborts the test.
+ */
+TEST(ns_bind_mount_keeps_in_tree)
+{
+	struct file_handle *handle;
+	int mount_id;
+	int ret;
+	int fd;
+	int saved_errno;
+	int pipefd[2];
+	pid_t pid;
+	int status;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+	char tmpfile[] = "/tmp/ns-test-XXXXXX";
+	int tmpfd;
+	ssize_t len;
+
+	/* Create temporary file for bind mount */
+	tmpfd = mkstemp(tmpfile);
+	if (tmpfd < 0) {
+		SKIP(return, "Cannot create temporary file");
+	}
+	close(tmpfd);
+
+	/* Remove the file first when the following assertion will abort */
+	ret = pipe(pipefd);
+	if (ret != 0)
+		unlink(tmpfile);
+	ASSERT_EQ(ret, 0);
+
+	pid = fork();
+	if (pid < 0)
+		unlink(tmpfile);
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process */
+		close(pipefd[0]);
+
+		/* Unshare mount namespace and make mounts private to avoid propagation */
+		ret = unshare(CLONE_NEWNS);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+		ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Create new network namespace */
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Bind mount the namespace */
+		ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL);
+		if (ret < 0) {
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Get file handle */
+		fd = open("/proc/self/ns/net", O_RDONLY);
+		if (fd < 0) {
+			umount(tmpfile);
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		handle = (struct file_handle *)buf;
+		handle->handle_bytes = MAX_HANDLE_SZ;
+		ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH);
+		close(fd);
+
+		if (ret < 0) {
+			umount(tmpfile);
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+
+		/* Send handle to parent; a short or failed write is an error */
+		len = sizeof(*handle) + handle->handle_bytes;
+		if (write(pipefd[1], buf, len) != len) {
+			umount(tmpfile);
+			close(pipefd[1]);
+			unlink(tmpfile);
+			exit(1);
+		}
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(pipefd[1]);
+	ret = read(pipefd[0], buf, sizeof(buf));
+	close(pipefd[0]);
+
+	waitpid(pid, &status, 0);
+	/* One of the assertions below will abort: clean up before it does */
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 || ret <= 0)
+		unlink(tmpfile);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	ASSERT_GT(ret, 0);
+	handle = (struct file_handle *)buf;
+
+	/*
+	 * Namespace should be inactive but still in tree due to bind mount.
+	 * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree).
+	 */
+	fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	saved_errno = errno;
+
+	/* Cleanup before asserting so the temporary file is never left behind */
+	if (fd >= 0)
+		close(fd);
+	umount(tmpfile);
+	unlink(tmpfile);
+
+	ASSERT_LT(fd, 0);
+	/* Should be ENOENT (inactive) since bind mount keeps it in tree */
+	if (saved_errno != ENOENT && saved_errno != ESTALE) {
+		TH_LOG("Unexpected error: %d", saved_errno);
+	}
+}
+
+/*
+ * Test multi-level hierarchy (3+ levels deep).
+ * Grandparent → Parent → Child
+ * When child is active, both parent AND grandparent should be active.
+ *
+ * The three levels are kept in tables indexed by LVL_GP / LVL_P / LVL_C so
+ * that creating the namespaces and building the handles share one loop each.
+ */
+TEST(ns_multilevel_hierarchy)
+{
+	enum { LVL_GP, LVL_P, LVL_C, LVL_COUNT };
+	char gp_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char p_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	char c_buf[sizeof(struct file_handle) + MAX_HANDLE_SZ];
+	struct file_handle *handles[LVL_COUNT] = {
+		[LVL_GP] = (struct file_handle *)gp_buf,
+		[LVL_P] = (struct file_handle *)p_buf,
+		[LVL_C] = (struct file_handle *)c_buf,
+	};
+	__u64 ids[LVL_COUNT];
+	int fds[LVL_COUNT];
+	int pipefd[2];
+	int status;
+	pid_t pid;
+	int ret;
+	int lvl;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		close(pipefd[0]);
+
+		/* Nest one user namespace per level and record its ID */
+		for (lvl = 0; lvl < LVL_COUNT; lvl++) {
+			int ns_fd;
+
+			if (setup_userns() < 0) {
+				close(pipefd[1]);
+				exit(1);
+			}
+
+			ns_fd = open("/proc/self/ns/user", O_RDONLY);
+			if (ns_fd < 0) {
+				close(pipefd[1]);
+				exit(1);
+			}
+
+			ret = ioctl(ns_fd, NS_GET_ID, &ids[lvl]);
+			close(ns_fd);
+			if (ret < 0) {
+				close(pipefd[1]);
+				exit(1);
+			}
+		}
+
+		/* Send all three namespace IDs, outermost first */
+		for (lvl = 0; lvl < LVL_COUNT; lvl++)
+			write(pipefd[1], &ids[lvl], sizeof(ids[lvl]));
+		close(pipefd[1]);
+		exit(0);
+	}
+
+	close(pipefd[1]);
+
+	/* Read all three namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &ids[LVL_GP], sizeof(ids[LVL_GP]));
+	if (ret != sizeof(ids[LVL_GP])) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read grandparent namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &ids[LVL_P], sizeof(ids[LVL_P]));
+	if (ret != sizeof(ids[LVL_P])) {
+		close(pipefd[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &ids[LVL_C], sizeof(ids[LVL_C]));
+	close(pipefd[0]);
+	if (ret != sizeof(ids[LVL_C])) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read child namespace ID from child");
+	}
+
+	/* Construct file handles from namespace IDs */
+	for (lvl = 0; lvl < LVL_COUNT; lvl++) {
+		struct nsfs_file_handle *nsfh;
+
+		handles[lvl]->handle_bytes = sizeof(struct nsfs_file_handle);
+		handles[lvl]->handle_type = FILEID_NSFS;
+		nsfh = (struct nsfs_file_handle *)handles[lvl]->f_handle;
+		nsfh->ns_id = ids[lvl];
+		nsfh->ns_type = 0;
+		nsfh->ns_inum = 0;
+	}
+
+	/* Open child before process exits */
+	fds[LVL_C] = open_by_handle_at(FD_NSFS_ROOT, handles[LVL_C], O_RDONLY);
+	if (fds[LVL_C] < 0) {
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open child namespace");
+	}
+
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * With 3-level hierarchy and child active:
+	 * - Child is active (we hold fd)
+	 * - Parent should be active (propagated from child)
+	 * - Grandparent should be active (propagated from parent)
+	 */
+	TH_LOG("Testing parent active when child is active");
+	fds[LVL_P] = open_by_handle_at(FD_NSFS_ROOT, handles[LVL_P], O_RDONLY);
+	ASSERT_GE(fds[LVL_P], 0);
+
+	TH_LOG("Testing grandparent active when child is active");
+	fds[LVL_GP] = open_by_handle_at(FD_NSFS_ROOT, handles[LVL_GP], O_RDONLY);
+	ASSERT_GE(fds[LVL_GP], 0);
+
+	close(fds[LVL_C]);
+	close(fds[LVL_P]);
+	close(fds[LVL_GP]);
+}
+
+/*
+ * Test multiple children sharing same parent.
+ * Parent should stay active as long as ANY child is active.
+ *
+ * The child process calls setup_userns() twice and then unshares a network
+ * namespace, reporting the three namespace IDs (p, c1, c2) over a pipe.
+ * The test process builds nsfs file handles from those IDs, opens c1 and
+ * c2 while the child is still alive, and then checks whether p can still
+ * be opened as c1 and c2 are closed one after the other.
+ *
+ * NOTE(review): the network namespace is unshared after the second
+ * setup_userns(), so it is presumably owned by c1 rather than by p, which
+ * would make the tree p -> c1 -> c2 and not two siblings under p. Confirm
+ * against setup_userns() before relying on the "same parent" wording.
+ */
+TEST(ns_multiple_children_same_parent)
+{
+ struct file_handle *p_handle, *c1_handle, *c2_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 p_id, c1_id, c2_id;
+ char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ];
+ char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ];
+ char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child process: only the write end of the pipe is needed */
+ close(pipefd[0]);
+
+ /* Create parent user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Record the ID of the parent user namespace */
+ int p_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (p_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+ close(p_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(p_fd);
+
+ /* Create first child user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Record the ID of the first child user namespace */
+ int c1_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (c1_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) {
+ close(c1_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c1_fd);
+
+ /*
+ * Returning to the parent user namespace to create a second user
+ * namespace there is not easily possible, so a network namespace
+ * is created instead and used as the second namespace.
+ */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ /* Record the ID of the network namespace */
+ int c2_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (c2_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) {
+ close(c2_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(c2_fd);
+
+ /* Send all namespace IDs */
+ write(pipefd[1], &p_id, sizeof(p_id));
+ write(pipefd[1], &c1_id, sizeof(c1_id));
+ write(pipefd[1], &c2_id, sizeof(c2_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Test process: only the read end of the pipe is needed */
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &p_id, sizeof(p_id));
+ if (ret != sizeof(p_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read parent namespace ID");
+ }
+
+ ret = read(pipefd[0], &c1_id, sizeof(c1_id));
+ if (ret != sizeof(c1_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read first child namespace ID");
+ }
+
+ ret = read(pipefd[0], &c2_id, sizeof(c2_id));
+ close(pipefd[0]);
+ if (ret != sizeof(c2_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read second child namespace ID");
+ }
+
+ /*
+ * Construct file handles from namespace IDs. Only ns_id carries a
+ * value; ns_type and ns_inum are set to 0.
+ */
+ p_handle = (struct file_handle *)p_buf;
+ p_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ p_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle;
+ p_fh->ns_id = p_id;
+ p_fh->ns_type = 0;
+ p_fh->ns_inum = 0;
+
+ c1_handle = (struct file_handle *)c1_buf;
+ c1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c1_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle;
+ c1_fh->ns_id = c1_id;
+ c1_fh->ns_type = 0;
+ c1_fh->ns_inum = 0;
+
+ c2_handle = (struct file_handle *)c2_buf;
+ c2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ c2_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle;
+ c2_fh->ns_id = c2_id;
+ c2_fh->ns_type = 0;
+ c2_fh->ns_inum = 0;
+
+ /* Open both children before process exits */
+ int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY);
+ int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY);
+
+ /* Either open failing means the scenario cannot be set up: skip */
+ if (c1_fd < 0 || c2_fd < 0) {
+ if (c1_fd >= 0) close(c1_fd);
+ if (c2_fd >= 0) close(c2_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open child namespaces");
+ }
+
+ /* From here on only the two fds above refer to c1 and c2 */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* Parent should be active (both children active) */
+ TH_LOG("Both children active - parent should be active");
+ int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close first child - parent should STILL be active */
+ TH_LOG("Closing first child - parent should still be active");
+ close(c1_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_GE(p_fd, 0);
+ close(p_fd);
+
+ /* Close second child - NOW parent should become inactive */
+ TH_LOG("Closing second child - parent should become inactive");
+ close(c2_fd);
+ p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY);
+ ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that different namespace types with same owner all contribute
+ * active references to the owning user namespace.
+ *
+ * A forked child calls setup_userns(), then unshares a network and a UTS
+ * namespace, and sends the three NS_GET_ID values to the parent over a pipe.
+ * The parent builds nsfs file handles from those IDs, holds fds on the net
+ * and UTS namespaces, and checks that the user namespace can be opened by
+ * handle until the last of those two fds has been closed.
+ *
+ * The test is skipped when an ID cannot be read from the pipe or when the
+ * net/UTS namespaces cannot be opened by handle.
+ */
+TEST(ns_different_types_same_owner)
+{
+ struct file_handle *u_handle, *n_handle, *ut_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only the write end of the pipe is used from here on */
+ close(pipefd[0]);
+
+ /* Create user namespace */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ /* Create network namespace (owned by user namespace) */
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ /* Create UTS namespace (also owned by user namespace) */
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ /*
+ * NOTE(review): the write() results are not checked; a failed or
+ * short write shows up in the parent as a short read and a SKIP.
+ */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent: only the read end of the pipe is used from here on */
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /*
+ * Only ns_id is filled in; ns_type and ns_inum are left at zero.
+ * NOTE(review): presumably the kernel then resolves the handle by ID
+ * alone - confirm against the nsfs file handle documentation.
+ */
+ u_handle = (struct file_handle *)u_buf;
+ u_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ u_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ n_handle = (struct file_handle *)n_buf;
+ n_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ n_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ ut_handle = (struct file_handle *)ut_buf;
+ ut_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ut_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces before process exits */
+ /*
+ * NOTE(review): nothing synchronizes with the child, which calls
+ * exit(0) right after its writes. Presumably these opens can lose
+ * that race; a failure here is turned into a SKIP below.
+ */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY);
+
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ /* Reap the child and require a clean exit before checking the user ns */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /*
+ * Both network and UTS namespaces are active.
+ * User namespace should be active (gets 2 active refs).
+ */
+ TH_LOG("Both net and uts active - user namespace should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ /* Close the probe fd right away so it does not influence the next check */
+ close(u_fd);
+
+ /* Close network namespace - user namespace should STILL be active */
+ TH_LOG("Closing network ns - user ns should still be active (uts still active)");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close UTS namespace - user namespace should become inactive */
+ TH_LOG("Closing uts ns - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/*
+ * Test hierarchical propagation with deep namespace hierarchy.
+ * Create: init_user_ns -> user_A -> user_B -> net_ns
+ * When net_ns is active, both user_A and user_B should be active.
+ * This verifies the conditional recursion in __ns_ref_active_put() works.
+ *
+ * A forked child calls setup_userns() twice, unshares a network namespace
+ * and sends the three NS_GET_ID values to the parent over a pipe. The parent
+ * holds an fd on net_ns, then opens user_B and user_A by handle and closes
+ * the fds from the bottom of the hierarchy upwards, re-opening the next
+ * level after each close.
+ *
+ * The test is skipped when an ID cannot be read from the pipe or when net_ns
+ * cannot be opened by handle.
+ */
+TEST(ns_deep_hierarchy_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle, *net_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id, net_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+ char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only the write end of the pipe is used from here on */
+ close(pipefd[0]);
+
+ /* Create user_A -> user_B -> net hierarchy */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ /*
+ * Second call: presumably nests user_B inside user_A
+ * (setup_userns() is defined elsewhere - TODO confirm).
+ */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (net_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) {
+ close(net_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(net_fd);
+
+ /* Send all three namespace IDs */
+ /*
+ * NOTE(review): the write() results are not checked; a failed or
+ * short write shows up in the parent as a short read and a SKIP.
+ */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ write(pipefd[1], &net_id, sizeof(net_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent: only the read end of the pipe is used from here on */
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ if (ret != sizeof(ub_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ ret = read(pipefd[0], &net_id, sizeof(net_id));
+ close(pipefd[0]);
+ if (ret != sizeof(net_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /* Only ns_id is filled in; ns_type and ns_inum are left at zero */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)net_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ net_fh->ns_id = net_id;
+ net_fh->ns_type = 0;
+ net_fh->ns_inum = 0;
+
+ /* Open net_ns before child exits to keep it active */
+ /*
+ * NOTE(review): nothing synchronizes with the child, which calls
+ * exit(0) right after its writes. Presumably this open can lose that
+ * race; a failure here is turned into a SKIP.
+ */
+ int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ if (net_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open network namespace");
+ }
+
+ /* Reap the child and require a clean exit before probing the hierarchy */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With net_ns active, both user_A and user_B should be active */
+ /* ub_fd and ua_fd are deliberately kept open; the later steps rely on them */
+ TH_LOG("Testing user_B active (net_ns active causes propagation)");
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd, 0);
+
+ TH_LOG("Testing user_A active (propagated through user_B)");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close net_ns - user_B should stay active (we hold direct ref) */
+ TH_LOG("Closing net_ns, user_B should remain active (direct ref held)");
+ close(net_fd);
+ int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ ASSERT_GE(ub_fd2, 0);
+ close(ub_fd2);
+
+ /* Close user_B - user_A should stay active (we hold direct ref) */
+ TH_LOG("Closing user_B, user_A should remain active (direct ref held)");
+ close(ub_fd);
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - everything should become inactive */
+ TH_LOG("Closing user_A, all should become inactive");
+ close(ua_fd);
+
+ /* All should now be inactive */
+ /* NOTE(review): only user_A is re-checked; user_B and net_ns are not probed */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test that parent stays active as long as ANY child is active.
+ * Create parent user namespace with two child net namespaces.
+ * Parent should remain active until BOTH children are inactive.
+ *
+ * The forked child keeps fds on both net namespaces open and blocks on
+ * syncpipe until the parent has opened them by handle. Namespace IDs travel
+ * from child to parent over pipefd.
+ *
+ * Every parent path that reaps the child first closes its write end of
+ * syncpipe, so a child blocked in read() on that pipe sees EOF and waitpid()
+ * cannot hang; it also avoids leaking the descriptor on the SKIP paths.
+ *
+ * The test is skipped when an ID cannot be read from the pipe or when the
+ * net namespaces cannot be opened by handle.
+ */
+TEST(ns_parent_multiple_children_refcount)
+{
+	struct file_handle *parent_handle, *net1_handle, *net2_handle;
+	int ret, pipefd[2], syncpipe[2];
+	pid_t pid;
+	int status;
+	__u64 p_id, n1_id, n2_id;
+	char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ];
+	char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ];
+	char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ];
+	char sync_byte;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: writes IDs to pipefd, waits on syncpipe */
+		close(pipefd[0]);
+		close(syncpipe[1]);
+
+		/* Create parent user namespace */
+		if (setup_userns() < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int p_fd = open("/proc/self/ns/user", O_RDONLY);
+		if (p_fd < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) {
+			close(p_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(p_fd);
+
+		/* Create first network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int n1_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (n1_fd < 0) {
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		/* Keep n1_fd open so first namespace stays active */
+
+		/* Create second network namespace */
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		int n2_fd = open("/proc/self/ns/net", O_RDONLY);
+		if (n2_fd < 0) {
+			close(n1_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		if (ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) {
+			close(n1_fd);
+			close(n2_fd);
+			close(pipefd[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+		/* Keep both n1_fd and n2_fd open */
+
+		/* Send all namespace IDs */
+		write(pipefd[1], &p_id, sizeof(p_id));
+		write(pipefd[1], &n1_id, sizeof(n1_id));
+		write(pipefd[1], &n2_id, sizeof(n2_id));
+		close(pipefd[1]);
+
+		/* Wait for parent to signal before exiting */
+		read(syncpipe[0], &sync_byte, 1);
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	/* Parent: reads IDs from pipefd, signals the child through syncpipe */
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	/* Read all three namespace IDs - fixed size, no parsing needed */
+	ret = read(pipefd[0], &p_id, sizeof(p_id));
+	if (ret != sizeof(p_id)) {
+		close(pipefd[0]);
+		/* Release a child that may be waiting on syncpipe before reaping it */
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read parent namespace ID");
+	}
+
+	ret = read(pipefd[0], &n1_id, sizeof(n1_id));
+	if (ret != sizeof(n1_id)) {
+		close(pipefd[0]);
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read first network namespace ID");
+	}
+
+	ret = read(pipefd[0], &n2_id, sizeof(n2_id));
+	close(pipefd[0]);
+	if (ret != sizeof(n2_id)) {
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read second network namespace ID");
+	}
+
+	/* Construct file handles from namespace IDs (only ns_id is filled in) */
+	parent_handle = (struct file_handle *)p_buf;
+	parent_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	parent_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle;
+	p_fh->ns_id = p_id;
+	p_fh->ns_type = 0;
+	p_fh->ns_inum = 0;
+
+	net1_handle = (struct file_handle *)n1_buf;
+	net1_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	net1_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *n1_fh = (struct nsfs_file_handle *)net1_handle->f_handle;
+	n1_fh->ns_id = n1_id;
+	n1_fh->ns_type = 0;
+	n1_fh->ns_inum = 0;
+
+	net2_handle = (struct file_handle *)n2_buf;
+	net2_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	net2_handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle;
+	n2_fh->ns_id = n2_id;
+	n2_fh->ns_type = 0;
+	n2_fh->ns_inum = 0;
+
+	/* Open both net namespaces while child is still alive */
+	int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY);
+	int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY);
+	if (n1_fd < 0 || n2_fd < 0) {
+		if (n1_fd >= 0)
+			close(n1_fd);
+		if (n2_fd >= 0)
+			close(n2_fd);
+		sync_byte = 'G';
+		write(syncpipe[1], &sync_byte, 1);
+		close(syncpipe[1]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to open net namespaces");
+	}
+
+	/* Signal child that we have opened the namespaces */
+	sync_byte = 'G';
+	write(syncpipe[1], &sync_byte, 1);
+	close(syncpipe[1]);
+
+	/* Wait for child to exit */
+	waitpid(pid, &status, 0);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Parent should be active (has 2 active children) */
+	TH_LOG("Both net namespaces active - parent should be active");
+	int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(p_fd, 0);
+	close(p_fd);
+
+	/* Close first net namespace - parent should STILL be active */
+	TH_LOG("Closing first net ns - parent should still be active");
+	close(n1_fd);
+	p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_GE(p_fd, 0);
+	close(p_fd);
+
+	/* Close second net namespace - parent should become inactive */
+	TH_LOG("Closing second net ns - parent should become inactive");
+	close(n2_fd);
+	p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY);
+	ASSERT_LT(p_fd, 0);
+}
+
+/*
+ * Test that user namespace as a child also propagates correctly.
+ * Create user_A -> user_B, verify when user_B is active that user_A
+ * is also active. This is different from non-user namespace children.
+ *
+ * A forked child calls setup_userns() twice and sends both NS_GET_ID values
+ * to the parent over a pipe. The parent holds an fd on user_B, checks that
+ * user_A can be opened by handle, and finally checks that user_A can no
+ * longer be opened once every fd held by the test has been closed.
+ *
+ * The test is skipped when an ID cannot be read from the pipe or when user_B
+ * cannot be opened by handle.
+ */
+TEST(ns_userns_child_propagation)
+{
+ struct file_handle *ua_handle, *ub_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 ua_id, ub_id;
+ char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ];
+ char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only the write end of the pipe is used from here on */
+ close(pipefd[0]);
+
+ /* Create user_A */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ua_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ua_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) {
+ close(ua_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ua_fd);
+
+ /* Create user_B (child of user_A) */
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ub_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (ub_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) {
+ close(ub_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ub_fd);
+
+ /* Send both namespace IDs */
+ /*
+ * NOTE(review): the write() results are not checked; a failed or
+ * short write shows up in the parent as a short read and a SKIP.
+ */
+ write(pipefd[1], &ua_id, sizeof(ua_id));
+ write(pipefd[1], &ub_id, sizeof(ub_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent: only the read end of the pipe is used from here on */
+ close(pipefd[1]);
+
+ /* Read both namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &ua_id, sizeof(ua_id));
+ if (ret != sizeof(ua_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_A namespace ID");
+ }
+
+ ret = read(pipefd[0], &ub_id, sizeof(ub_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ub_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user_B namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /* Only ns_id is filled in; ns_type and ns_inum are left at zero */
+ ua_handle = (struct file_handle *)ua_buf;
+ ua_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ua_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle;
+ ua_fh->ns_id = ua_id;
+ ua_fh->ns_type = 0;
+ ua_fh->ns_inum = 0;
+
+ ub_handle = (struct file_handle *)ub_buf;
+ ub_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ ub_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle;
+ ub_fh->ns_id = ub_id;
+ ub_fh->ns_type = 0;
+ ub_fh->ns_inum = 0;
+
+ /* Open user_B before child exits */
+ /*
+ * NOTE(review): nothing synchronizes with the child, which calls
+ * exit(0) right after its writes. Presumably this open can lose that
+ * race; a failure here is turned into a SKIP.
+ */
+ int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY);
+ if (ub_fd < 0) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open user_B");
+ }
+
+ /* Reap the child and require a clean exit before probing user_A */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* With user_B active, user_A should also be active */
+ /* ua_fd is deliberately kept open; the next step relies on it */
+ TH_LOG("Testing user_A active when child user_B is active");
+ int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd, 0);
+
+ /* Close user_B */
+ TH_LOG("Closing user_B");
+ close(ub_fd);
+
+ /* user_A should remain active (we hold direct ref) */
+ int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_GE(ua_fd2, 0);
+ close(ua_fd2);
+
+ /* Close user_A - should become inactive */
+ TH_LOG("Closing user_A - should become inactive");
+ close(ua_fd);
+
+ /* No fd opened by this test remains, so the open is expected to fail */
+ ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY);
+ ASSERT_LT(ua_fd, 0);
+}
+
+/*
+ * Test different namespace types (net and uts; no ipc namespace is created
+ * here) all contributing active references to the same owning user
+ * namespace.
+ *
+ * A forked child calls setup_userns(), then unshares a network and a UTS
+ * namespace, and sends the three NS_GET_ID values to the parent over a pipe.
+ * The parent holds fds on the net and UTS namespaces and checks that the
+ * user namespace can be opened by handle until the last of those two fds has
+ * been closed.
+ *
+ * NOTE(review): this appears to follow the same steps as
+ * ns_different_types_same_owner.
+ *
+ * The test is skipped when an ID cannot be read from the pipe or when the
+ * net/UTS namespaces cannot be opened by handle.
+ */
+TEST(ns_mixed_types_same_owner)
+{
+ struct file_handle *user_handle, *net_handle, *uts_handle;
+ int ret, pipefd[2];
+ pid_t pid;
+ int status;
+ __u64 u_id, n_id, ut_id;
+ char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ];
+ char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ];
+ char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ];
+
+ ASSERT_EQ(pipe(pipefd), 0);
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ /* Child: only the write end of the pipe is used from here on */
+ close(pipefd[0]);
+
+ if (setup_userns() < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int u_fd = open("/proc/self/ns/user", O_RDONLY);
+ if (u_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) {
+ close(u_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(u_fd);
+
+ if (unshare(CLONE_NEWNET) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int n_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (n_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) {
+ close(n_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(n_fd);
+
+ if (unshare(CLONE_NEWUTS) < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+
+ int ut_fd = open("/proc/self/ns/uts", O_RDONLY);
+ if (ut_fd < 0) {
+ close(pipefd[1]);
+ exit(1);
+ }
+ if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) {
+ close(ut_fd);
+ close(pipefd[1]);
+ exit(1);
+ }
+ close(ut_fd);
+
+ /* Send all namespace IDs */
+ /*
+ * NOTE(review): the write() results are not checked; a failed or
+ * short write shows up in the parent as a short read and a SKIP.
+ */
+ write(pipefd[1], &u_id, sizeof(u_id));
+ write(pipefd[1], &n_id, sizeof(n_id));
+ write(pipefd[1], &ut_id, sizeof(ut_id));
+ close(pipefd[1]);
+ exit(0);
+ }
+
+ /* Parent: only the read end of the pipe is used from here on */
+ close(pipefd[1]);
+
+ /* Read all three namespace IDs - fixed size, no parsing needed */
+ ret = read(pipefd[0], &u_id, sizeof(u_id));
+ if (ret != sizeof(u_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read user namespace ID");
+ }
+
+ ret = read(pipefd[0], &n_id, sizeof(n_id));
+ if (ret != sizeof(n_id)) {
+ close(pipefd[0]);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read network namespace ID");
+ }
+
+ ret = read(pipefd[0], &ut_id, sizeof(ut_id));
+ close(pipefd[0]);
+ if (ret != sizeof(ut_id)) {
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to read UTS namespace ID");
+ }
+
+ /* Construct file handles from namespace IDs */
+ /* Only ns_id is filled in; ns_type and ns_inum are left at zero */
+ user_handle = (struct file_handle *)u_buf;
+ user_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ user_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle;
+ u_fh->ns_id = u_id;
+ u_fh->ns_type = 0;
+ u_fh->ns_inum = 0;
+
+ net_handle = (struct file_handle *)n_buf;
+ net_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ net_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle;
+ n_fh->ns_id = n_id;
+ n_fh->ns_type = 0;
+ n_fh->ns_inum = 0;
+
+ uts_handle = (struct file_handle *)ut_buf;
+ uts_handle->handle_bytes = sizeof(struct nsfs_file_handle);
+ uts_handle->handle_type = FILEID_NSFS;
+ struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)uts_handle->f_handle;
+ ut_fh->ns_id = ut_id;
+ ut_fh->ns_type = 0;
+ ut_fh->ns_inum = 0;
+
+ /* Open both non-user namespaces */
+ /*
+ * NOTE(review): nothing synchronizes with the child, which calls
+ * exit(0) right after its writes. Presumably these opens can lose
+ * that race; a failure here is turned into a SKIP below.
+ */
+ int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+ int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY);
+ if (n_fd < 0 || ut_fd < 0) {
+ if (n_fd >= 0) close(n_fd);
+ if (ut_fd >= 0) close(ut_fd);
+ waitpid(pid, NULL, 0);
+ SKIP(return, "Failed to open namespaces");
+ }
+
+ /* Reap the child and require a clean exit before checking the user ns */
+ waitpid(pid, &status, 0);
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+
+ /* User namespace should be active (2 active children) */
+ TH_LOG("Both net and uts active - user ns should be active");
+ int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ /* Close the probe fd right away so it does not influence the next check */
+ close(u_fd);
+
+ /* Close net - user ns should STILL be active (uts still active) */
+ TH_LOG("Closing net - user ns should still be active");
+ close(n_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_GE(u_fd, 0);
+ close(u_fd);
+
+ /* Close uts - user ns should become inactive */
+ TH_LOG("Closing uts - user ns should become inactive");
+ close(ut_fd);
+ u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+ ASSERT_LT(u_fd, 0);
+}
+
+/* Thread test helpers and structures */
+/*
+ * Shared between a test and the thread it starts with
+ * thread_create_namespace(); the test fills in the fds, the thread fills in
+ * ns_id and exit_code.
+ */
+struct thread_ns_info {
+ /* Namespace ID the thread obtained via NS_GET_ID */
+ __u64 ns_id;
+ /* fd the thread writes ns_id to */
+ int pipefd;
+ /* fd the thread reads one byte from before it returns */
+ int syncfd_read;
+ /* Set to -1 by the tests; not read by thread_create_namespace() */
+ int syncfd_write;
+ /* 0 on success, otherwise 1-5 identifying the step that failed */
+ int exit_code;
+};
+
+/*
+ * Thread body: unshare a new network namespace, report its ID, then wait.
+ *
+ * @arg: pointer to the struct thread_ns_info shared with the creating test.
+ *
+ * The ID obtained via NS_GET_ID is stored in info->ns_id and written to
+ * info->pipefd. The thread then blocks until one byte can be read from
+ * info->syncfd_read. info->exit_code is set to 0 on success, or to 1-5 naming
+ * the step that failed (unshare, open, ioctl, write, sync read).
+ *
+ * If the thread fails before the ID has been completely written, it closes
+ * info->pipefd and sets the field to -1. The write end of that pipe lives in
+ * the same process as the reader, so without this a reader blocked in read()
+ * would never see EOF and would wait forever. A later close() of the same
+ * descriptor number by the caller then simply fails with EBADF.
+ *
+ * Always returns NULL.
+ */
+static void *thread_create_namespace(void *arg)
+{
+	struct thread_ns_info *info = (struct thread_ns_info *)arg;
+	char sync_byte;
+	int fd;
+	int ret;
+
+	/* Create new network namespace */
+	ret = unshare(CLONE_NEWNET);
+	if (ret < 0) {
+		info->exit_code = 1;
+		goto fail_unblock_reader;
+	}
+
+	/* Get namespace ID */
+	fd = open("/proc/thread-self/ns/net", O_RDONLY);
+	if (fd < 0) {
+		info->exit_code = 2;
+		goto fail_unblock_reader;
+	}
+
+	ret = ioctl(fd, NS_GET_ID, &info->ns_id);
+	close(fd);
+	if (ret < 0) {
+		info->exit_code = 3;
+		goto fail_unblock_reader;
+	}
+
+	/* Send namespace ID to main thread */
+	if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) {
+		info->exit_code = 4;
+		goto fail_unblock_reader;
+	}
+
+	/* Wait for signal to exit; the ID was delivered, so the pipe is left alone */
+	if (read(info->syncfd_read, &sync_byte, 1) != 1) {
+		info->exit_code = 5;
+		return NULL;
+	}
+
+	info->exit_code = 0;
+	return NULL;
+
+fail_unblock_reader:
+	/* Give the reader EOF instead of leaving it blocked on a pipe nobody writes */
+	close(info->pipefd);
+	info->pipefd = -1;
+	return NULL;
+}
+
+/*
+ * Test that namespace becomes inactive after thread exits.
+ * This verifies active reference counting works with threads, not just processes.
+ *
+ * A thread running thread_create_namespace() unshares a network namespace and
+ * sends its ID over pipefd. The main thread opens the namespace by handle
+ * while the thread is alive, releases the thread through syncpipe, joins it,
+ * and then expects open_by_handle_at() to fail with ENOENT or ESTALE.
+ *
+ * The test is skipped when the ID cannot be read or the thread reports a
+ * non-zero exit_code.
+ */
+TEST(thread_ns_inactive_after_exit)
+{
+	pthread_t thread;
+	struct thread_ns_info info;
+	struct file_handle *handle;
+	int pipefd[2];
+	int syncpipe[2];
+	int ret;
+	int saved_errno;
+	char sync_byte;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	info.pipefd = pipefd[1];
+	info.syncfd_read = syncpipe[0];
+	info.syncfd_write = -1;
+	info.exit_code = -1;
+
+	/* Create thread that will create a namespace */
+	ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+	ASSERT_EQ(ret, 0);
+
+	/* Read namespace ID from thread */
+	__u64 ns_id;
+	ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+	if (ret != sizeof(ns_id)) {
+		/* Release the thread in case it is waiting, then clean up */
+		sync_byte = 'X';
+		write(syncpipe[1], &sync_byte, 1);
+		pthread_join(thread, NULL);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		close(syncpipe[0]);
+		close(syncpipe[1]);
+		SKIP(return, "Failed to read namespace ID from thread");
+	}
+
+	TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+	/* Construct file handle (only ns_id is filled in) */
+	handle = (struct file_handle *)buf;
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+	fh->ns_id = ns_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Namespace should be active while thread is alive */
+	TH_LOG("Attempting to open namespace while thread is alive (should succeed)");
+	int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd, 0);
+	close(nsfd);
+
+	/* Signal thread to exit */
+	TH_LOG("Signaling thread to exit");
+	sync_byte = 'X';
+	ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+	close(syncpipe[1]);
+
+	/* Wait for thread to exit */
+	ASSERT_EQ(pthread_join(thread, NULL), 0);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	if (info.exit_code != 0)
+		SKIP(return, "Thread failed to create namespace");
+
+	TH_LOG("Thread exited, namespace should be inactive");
+
+	/* Namespace should now be inactive */
+	nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	/* Capture errno before any logging call gets a chance to overwrite it */
+	saved_errno = errno;
+	ASSERT_LT(nsfd, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(saved_errno), saved_errno);
+	ASSERT_TRUE(saved_errno == ENOENT || saved_errno == ESTALE);
+}
+
+/*
+ * Test that a namespace remains active while a thread holds an fd to it.
+ * Even after the thread exits, the namespace should remain active as long as
+ * another thread holds a file descriptor to it.
+ *
+ * A thread running thread_create_namespace() unshares a network namespace and
+ * sends its ID over pipefd. The main thread opens the namespace by handle and
+ * keeps that fd across the thread's exit, checks that a second open yields
+ * the same inode, then closes its fd and expects open_by_handle_at() to fail
+ * with ENOENT or ESTALE.
+ *
+ * The test is skipped when the ID cannot be read or the thread reports a
+ * non-zero exit_code.
+ */
+TEST(thread_ns_fd_keeps_active)
+{
+	pthread_t thread;
+	struct thread_ns_info info;
+	struct file_handle *handle;
+	int pipefd[2];
+	int syncpipe[2];
+	int ret;
+	int saved_errno;
+	char sync_byte;
+	char buf[sizeof(*handle) + MAX_HANDLE_SZ];
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	info.pipefd = pipefd[1];
+	info.syncfd_read = syncpipe[0];
+	info.syncfd_write = -1;
+	info.exit_code = -1;
+
+	/* Create thread that will create a namespace */
+	ret = pthread_create(&thread, NULL, thread_create_namespace, &info);
+	ASSERT_EQ(ret, 0);
+
+	/* Read namespace ID from thread */
+	__u64 ns_id;
+	ret = read(pipefd[0], &ns_id, sizeof(ns_id));
+	if (ret != sizeof(ns_id)) {
+		/* Release the thread in case it is waiting, then clean up */
+		sync_byte = 'X';
+		write(syncpipe[1], &sync_byte, 1);
+		pthread_join(thread, NULL);
+		close(pipefd[0]);
+		close(pipefd[1]);
+		close(syncpipe[0]);
+		close(syncpipe[1]);
+		SKIP(return, "Failed to read namespace ID from thread");
+	}
+
+	TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id);
+
+	/* Construct file handle (only ns_id is filled in) */
+	handle = (struct file_handle *)buf;
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+	struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+	fh->ns_id = ns_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	/* Open namespace while thread is alive */
+	TH_LOG("Opening namespace while thread is alive");
+	int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd, 0);
+
+	/* Signal thread to exit */
+	TH_LOG("Signaling thread to exit");
+	sync_byte = 'X';
+	write(syncpipe[1], &sync_byte, 1);
+	close(syncpipe[1]);
+
+	/* Wait for thread to exit */
+	pthread_join(thread, NULL);
+	close(pipefd[0]);
+	close(pipefd[1]);
+	close(syncpipe[0]);
+
+	if (info.exit_code != 0) {
+		close(nsfd);
+		SKIP(return, "Thread failed to create namespace");
+	}
+
+	TH_LOG("Thread exited, but main thread holds fd - namespace should remain active");
+
+	/* Namespace should still be active because we hold an fd */
+	int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_GE(nsfd2, 0);
+
+	/* Verify it's the same namespace */
+	struct stat st1, st2;
+	ASSERT_EQ(fstat(nsfd, &st1), 0);
+	ASSERT_EQ(fstat(nsfd2, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	close(nsfd2);
+
+	TH_LOG("Closing fd - namespace should become inactive");
+	close(nsfd);
+
+	/* Now namespace should be inactive */
+	nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	/* Capture errno before any logging call gets a chance to overwrite it */
+	saved_errno = errno;
+	ASSERT_LT(nsfd, 0);
+	/* Should fail with ENOENT (inactive) or ESTALE (gone) */
+	TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(saved_errno), saved_errno);
+	ASSERT_TRUE(saved_errno == ENOENT || saved_errno == ESTALE);
+}
+
+/* Structure for thread data in subprocess */
+struct thread_sleep_data {
+ /* fd that thread_sleep_and_wait() reads one byte from before returning */
+ int syncfd_read;
+};
+
+/*
+ * Thread body that parks until its sync fd yields a byte or reports EOF.
+ *
+ * @arg: pointer to a struct thread_sleep_data naming the fd to wait on.
+ *
+ * The result of read() is intentionally discarded. Always returns NULL.
+ */
+static void *thread_sleep_and_wait(void *arg)
+{
+	struct thread_sleep_data *sleep_data = arg;
+	char discard;
+
+	/* Wait for signal to exit - read will unblock when pipe is closed */
+	(void)read(sleep_data->syncfd_read, &discard, 1);
+
+	return NULL;
+}
+
+/* Return 1 if @id is among the first @nr_ids entries of @ns_ids, else 0. */
+static int thread_subprocess_ns_listed(const __u64 *ns_ids, int nr_ids, __u64 id)
+{
+	for (int i = 0; i < nr_ids; i++) {
+		if (ns_ids[i] == id)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Child helper: fetch the ID of the caller's own namespace of the given
+ * @type ("user", "net", ...) via NS_GET_ID on /proc/self/ns/<type>.
+ * Returns 0 and fills @id on success, -1 after logging to stderr on failure.
+ */
+static int thread_subprocess_get_ns_id(const char *type, __u64 *id)
+{
+	char path[64];
+	int fd;
+
+	snprintf(path, sizeof(path), "/proc/self/ns/%s", type);
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "Child: open(%s) failed: %s\n", path, strerror(errno));
+		return -1;
+	}
+
+	if (ioctl(fd, NS_GET_ID, id) < 0) {
+		fprintf(stderr, "Child: ioctl(NS_GET_ID) for %s ns failed: %s\n", type, strerror(errno));
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Fill @buf (large enough for a struct file_handle plus MAX_HANDLE_SZ, and
+ * suitably aligned) with an nsfs file handle that identifies a namespace
+ * purely by @ns_id; type and inode number are left as 0.
+ */
+static struct file_handle *thread_subprocess_init_handle(char *buf, __u64 ns_id)
+{
+	struct file_handle *handle = (struct file_handle *)buf;
+	struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle;
+
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+	fh->ns_id = ns_id;
+	fh->ns_type = 0;
+	fh->ns_inum = 0;
+
+	return handle;
+}
+
+/*
+ * Child side of thread_subprocess_ns_inactive_after_all_exit; never returns.
+ *
+ * Creates a user and a network namespace, reports both IDs through
+ * @id_pipe_wr, then starts two threads that block on @sync_rd and waits for
+ * them. Exits 0 once both threads are joined, 1 on any failure.
+ */
+static void thread_subprocess_child(int id_pipe_wr, int sync_rd)
+{
+	struct thread_sleep_data data = { .syncfd_read = sync_rd };
+	pthread_t thread1, thread2;
+	__u64 user_id, net_id;
+	int err;
+
+	/* Create user namespace with mappings */
+	if (setup_userns() < 0) {
+		fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno));
+		goto fail;
+	}
+	fprintf(stderr, "Child: setup_userns() succeeded\n");
+
+	/* Get user namespace ID */
+	if (thread_subprocess_get_ns_id("user", &user_id) < 0)
+		goto fail;
+	fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id);
+
+	/* Unshare network namespace */
+	if (unshare(CLONE_NEWNET) < 0) {
+		fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno));
+		goto fail;
+	}
+	fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n");
+
+	/* Get network namespace ID */
+	if (thread_subprocess_get_ns_id("net", &net_id) < 0)
+		goto fail;
+	fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id);
+
+	/* Send namespace IDs to parent */
+	if (write(id_pipe_wr, &user_id, sizeof(user_id)) != sizeof(user_id)) {
+		fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno));
+		goto fail;
+	}
+	if (write(id_pipe_wr, &net_id, sizeof(net_id)) != sizeof(net_id)) {
+		fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno));
+		goto fail;
+	}
+	close(id_pipe_wr);
+	id_pipe_wr = -1;
+	fprintf(stderr, "Child: sent namespace IDs to parent\n");
+
+	/* Create two threads that share the namespaces */
+	err = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data);
+	if (err != 0) {
+		fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(err));
+		goto fail;
+	}
+	fprintf(stderr, "Child: created thread1\n");
+
+	err = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data);
+	if (err != 0) {
+		fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(err));
+		pthread_cancel(thread1);
+		goto fail;
+	}
+	fprintf(stderr, "Child: created thread2\n");
+
+	/* Wait for threads to complete - they will unblock when parent writes */
+	fprintf(stderr, "Child: waiting for threads to exit\n");
+	pthread_join(thread1, NULL);
+	fprintf(stderr, "Child: thread1 exited\n");
+	pthread_join(thread2, NULL);
+	fprintf(stderr, "Child: thread2 exited\n");
+
+	close(sync_rd);
+
+	/* Exit - namespaces should become inactive */
+	fprintf(stderr, "Child: all threads joined, exiting with success\n");
+	exit(0);
+
+fail:
+	if (id_pipe_wr >= 0)
+		close(id_pipe_wr);
+	close(sync_rd);
+	exit(1);
+}
+
+/*
+ * Test that namespaces become inactive after subprocess with multiple threads exits.
+ * Create a subprocess that unshares user and network namespaces, then creates two
+ * threads that share those namespaces. Verify that after all threads and subprocess
+ * exit, the namespaces are no longer listed by listns() and cannot be opened by
+ * open_by_handle_at().
+ *
+ * The test is skipped when the child cannot set up its namespaces or exits
+ * with a non-zero status; the listns() checks are skipped when listns() fails.
+ */
+TEST(thread_subprocess_ns_inactive_after_all_exit)
+{
+	int pipefd[2];
+	int sv[2];
+	pid_t pid;
+	int status;
+	__u64 user_id, net_id;
+	struct file_handle *user_handle, *net_handle;
+	/* Aligned so the buffers can safely be accessed as struct file_handle */
+	char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ] __attribute__((aligned(8)));
+	char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ] __attribute__((aligned(8)));
+	char sync_byte = 'X';
+	int ret;
+
+	ASSERT_EQ(pipe(pipefd), 0);
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child process: keep only the ends it writes to / waits on */
+		close(pipefd[0]);
+		close(sv[0]);
+		thread_subprocess_child(pipefd[1], sv[1]);
+	}
+
+	/* Parent process */
+	close(pipefd[1]);
+	close(sv[1]);
+
+	TH_LOG("Parent: waiting to read namespace IDs from child");
+
+	/* Read namespace IDs from child */
+	ret = read(pipefd[0], &user_id, sizeof(user_id));
+	if (ret != (int)sizeof(user_id)) {
+		TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno));
+		close(pipefd[0]);
+		/*
+		 * The child has most likely exited already, so its end of the
+		 * socketpair is closed. MSG_NOSIGNAL keeps this best-effort
+		 * wakeup from killing us with SIGPIPE before we can SKIP.
+		 */
+		(void)send(sv[0], &sync_byte, 1, MSG_NOSIGNAL);
+		close(sv[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read user namespace ID from child");
+	}
+
+	ret = read(pipefd[0], &net_id, sizeof(net_id));
+	close(pipefd[0]);
+	if (ret != (int)sizeof(net_id)) {
+		TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno));
+		/* See above: the peer may be gone, avoid SIGPIPE */
+		(void)send(sv[0], &sync_byte, 1, MSG_NOSIGNAL);
+		close(sv[0]);
+		waitpid(pid, NULL, 0);
+		SKIP(return, "Failed to read network namespace ID from child");
+	}
+
+	TH_LOG("Child created user ns %llu and net ns %llu with 2 threads",
+	       (unsigned long long)user_id, (unsigned long long)net_id);
+
+	/* Construct file handles */
+	user_handle = thread_subprocess_init_handle(user_buf, user_id);
+	net_handle = thread_subprocess_init_handle(net_buf, net_id);
+
+	/* Verify namespaces are active while subprocess and threads are alive */
+	TH_LOG("Verifying namespaces are active while subprocess with threads is running");
+	int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+	ASSERT_GE(user_fd, 0);
+
+	int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+	ASSERT_GE(net_fd, 0);
+
+	close(user_fd);
+	close(net_fd);
+
+	/* Also verify they appear in listns() */
+	TH_LOG("Verifying namespaces appear in listns() while active");
+	struct ns_id_req req = {
+		.size = sizeof(struct ns_id_req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int nr_ids = sys_listns(&req, ns_ids, sizeof(ns_ids) / sizeof(ns_ids[0]), 0);
+	if (nr_ids < 0) {
+		TH_LOG("listns() not available, skipping listns verification");
+	} else {
+		ASSERT_TRUE(thread_subprocess_ns_listed(ns_ids, nr_ids, user_id));
+		TH_LOG("User namespace found in listns() as expected");
+
+		/* Check network namespace */
+		req.ns_type = CLONE_NEWNET;
+		nr_ids = sys_listns(&req, ns_ids, sizeof(ns_ids) / sizeof(ns_ids[0]), 0);
+		if (nr_ids >= 0) {
+			ASSERT_TRUE(thread_subprocess_ns_listed(ns_ids, nr_ids, net_id));
+			TH_LOG("Network namespace found in listns() as expected");
+		}
+	}
+
+	/* Signal threads to exit */
+	TH_LOG("Signaling threads to exit");
+	/* Write two bytes - one for each thread */
+	ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+	ASSERT_EQ(write(sv[0], &sync_byte, 1), 1);
+	close(sv[0]);
+
+	/* Wait for child process to exit */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	if (WEXITSTATUS(status) != 0) {
+		TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status));
+		SKIP(return, "Child process failed");
+	}
+
+	TH_LOG("Subprocess and all threads have exited successfully");
+
+	/* Verify namespaces are now inactive - open_by_handle_at should fail */
+	TH_LOG("Verifying namespaces are inactive after subprocess and threads exit");
+	user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY);
+	ASSERT_LT(user_fd, 0);
+	TH_LOG("User namespace inactive as expected: %s (errno=%d)",
+	       strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+	net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY);
+	ASSERT_LT(net_fd, 0);
+	TH_LOG("Network namespace inactive as expected: %s (errno=%d)",
+	       strerror(errno), errno);
+	ASSERT_TRUE(errno == ENOENT || errno == ESTALE);
+
+	/* Verify namespaces do NOT appear in listns() */
+	TH_LOG("Verifying namespaces do NOT appear in listns() when inactive");
+	memset(&req, 0, sizeof(req));
+	req.size = sizeof(struct ns_id_req);
+	req.ns_type = CLONE_NEWUSER;
+	nr_ids = sys_listns(&req, ns_ids, sizeof(ns_ids) / sizeof(ns_ids[0]), 0);
+	if (nr_ids >= 0) {
+		ASSERT_FALSE(thread_subprocess_ns_listed(ns_ids, nr_ids, user_id));
+		TH_LOG("User namespace correctly not listed in listns()");
+
+		/* Check network namespace */
+		req.ns_type = CLONE_NEWNET;
+		nr_ids = sys_listns(&req, ns_ids, sizeof(ns_ids) / sizeof(ns_ids[0]), 0);
+		if (nr_ids >= 0) {
+			ASSERT_FALSE(thread_subprocess_ns_listed(ns_ids, nr_ids, net_id));
+			TH_LOG("Network namespace correctly not listed in listns()");
+		}
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c
index e28accd74a57..527ade0a8673 100644
--- a/tools/testing/selftests/namespaces/nsid_test.c
+++ b/tools/testing/selftests/namespaces/nsid_test.c
@@ -6,6 +6,7 @@
#include <libgen.h>
#include <limits.h>
#include <pthread.h>
+#include <signal.h>
#include <string.h>
#include <sys/mount.h>
#include <poll.h>
@@ -14,12 +15,30 @@
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
+#include <sys/wait.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/limits.h>
#include <linux/nsfs.h>
#include "../kselftest_harness.h"
+/* Per-test state for the tests that fork a helper child */
+FIXTURE(nsid) {
+	pid_t child_pid;
+};
+
+FIXTURE_SETUP(nsid) {
+	/* No child yet; teardown only acts on a positive pid */
+	self->child_pid = 0;
+}
+
+FIXTURE_TEARDOWN(nsid) {
+	/* The test never recorded a child, so there is nothing to reap */
+	if (self->child_pid <= 0)
+		return;
+
+	/* Kill and reap the recorded child, whatever state it is in */
+	kill(self->child_pid, SIGKILL);
+	waitpid(self->child_pid, NULL, 0);
+}
+
+
TEST(nsid_mntns_basic)
{
__u64 mnt_ns_id = 0;
@@ -44,7 +63,7 @@ TEST(nsid_mntns_basic)
close(fd_mntns);
}
-TEST(nsid_mntns_separate)
+TEST_F(nsid, mntns_separate)
{
__u64 parent_mnt_ns_id = 0;
__u64 child_mnt_ns_id = 0;
@@ -90,6 +109,9 @@ TEST(nsid_mntns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -99,8 +121,6 @@ TEST(nsid_mntns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_mntns);
SKIP(return, "No permission to create mount namespace");
}
@@ -123,10 +143,6 @@ TEST(nsid_mntns_separate)
close(fd_parent_mntns);
close(fd_child_mntns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_cgroupns_basic)
@@ -153,7 +169,7 @@ TEST(nsid_cgroupns_basic)
close(fd_cgroupns);
}
-TEST(nsid_cgroupns_separate)
+TEST_F(nsid, cgroupns_separate)
{
__u64 parent_cgroup_ns_id = 0;
__u64 child_cgroup_ns_id = 0;
@@ -199,6 +215,9 @@ TEST(nsid_cgroupns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -208,8 +227,6 @@ TEST(nsid_cgroupns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_cgroupns);
SKIP(return, "No permission to create cgroup namespace");
}
@@ -232,10 +249,6 @@ TEST(nsid_cgroupns_separate)
close(fd_parent_cgroupns);
close(fd_child_cgroupns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_ipcns_basic)
@@ -262,7 +275,7 @@ TEST(nsid_ipcns_basic)
close(fd_ipcns);
}
-TEST(nsid_ipcns_separate)
+TEST_F(nsid, ipcns_separate)
{
__u64 parent_ipc_ns_id = 0;
__u64 child_ipc_ns_id = 0;
@@ -308,6 +321,9 @@ TEST(nsid_ipcns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -317,8 +333,6 @@ TEST(nsid_ipcns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_ipcns);
SKIP(return, "No permission to create IPC namespace");
}
@@ -341,10 +355,6 @@ TEST(nsid_ipcns_separate)
close(fd_parent_ipcns);
close(fd_child_ipcns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_utsns_basic)
@@ -371,7 +381,7 @@ TEST(nsid_utsns_basic)
close(fd_utsns);
}
-TEST(nsid_utsns_separate)
+TEST_F(nsid, utsns_separate)
{
__u64 parent_uts_ns_id = 0;
__u64 child_uts_ns_id = 0;
@@ -417,6 +427,9 @@ TEST(nsid_utsns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -426,8 +439,6 @@ TEST(nsid_utsns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_utsns);
SKIP(return, "No permission to create UTS namespace");
}
@@ -450,10 +461,6 @@ TEST(nsid_utsns_separate)
close(fd_parent_utsns);
close(fd_child_utsns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_userns_basic)
@@ -480,7 +487,7 @@ TEST(nsid_userns_basic)
close(fd_userns);
}
-TEST(nsid_userns_separate)
+TEST_F(nsid, userns_separate)
{
__u64 parent_user_ns_id = 0;
__u64 child_user_ns_id = 0;
@@ -526,6 +533,9 @@ TEST(nsid_userns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -535,8 +545,6 @@ TEST(nsid_userns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_userns);
SKIP(return, "No permission to create user namespace");
}
@@ -559,10 +567,6 @@ TEST(nsid_userns_separate)
close(fd_parent_userns);
close(fd_child_userns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_timens_basic)
@@ -591,7 +595,7 @@ TEST(nsid_timens_basic)
close(fd_timens);
}
-TEST(nsid_timens_separate)
+TEST_F(nsid, timens_separate)
{
__u64 parent_time_ns_id = 0;
__u64 child_time_ns_id = 0;
@@ -652,6 +656,9 @@ TEST(nsid_timens_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -660,8 +667,6 @@ TEST(nsid_timens_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_timens);
close(pipefd[0]);
SKIP(return, "Cannot create time namespace");
@@ -689,10 +694,6 @@ TEST(nsid_timens_separate)
close(fd_parent_timens);
close(fd_child_timens);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_pidns_basic)
@@ -719,7 +720,7 @@ TEST(nsid_pidns_basic)
close(fd_pidns);
}
-TEST(nsid_pidns_separate)
+TEST_F(nsid, pidns_separate)
{
__u64 parent_pid_ns_id = 0;
__u64 child_pid_ns_id = 0;
@@ -776,6 +777,9 @@ TEST(nsid_pidns_separate)
}
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -784,8 +788,6 @@ TEST(nsid_pidns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_pidns);
close(pipefd[0]);
SKIP(return, "No permission to create PID namespace");
@@ -813,10 +815,6 @@ TEST(nsid_pidns_separate)
close(fd_parent_pidns);
close(fd_child_pidns);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST(nsid_netns_basic)
@@ -860,7 +858,7 @@ TEST(nsid_netns_basic)
close(fd_netns);
}
-TEST(nsid_netns_separate)
+TEST_F(nsid, netns_separate)
{
__u64 parent_net_ns_id = 0;
__u64 parent_netns_cookie = 0;
@@ -920,6 +918,9 @@ TEST(nsid_netns_separate)
_exit(0);
}
+ /* Track child for cleanup */
+ self->child_pid = pid;
+
/* Parent process */
close(pipefd[1]);
@@ -929,8 +930,6 @@ TEST(nsid_netns_separate)
if (buf == 'S') {
/* Child couldn't create namespace, skip test */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
close(fd_parent_netns);
close(parent_sock);
SKIP(return, "No permission to create network namespace");
@@ -977,10 +976,6 @@ TEST(nsid_netns_separate)
close(fd_parent_netns);
close(fd_child_netns);
close(parent_sock);
-
- /* Clean up child process */
- kill(pid, SIGTERM);
- waitpid(pid, NULL, 0);
}
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
new file mode 100644
index 000000000000..753fd29dffd8
--- /dev/null
+++ b/tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "../pidfd/pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Regression tests for the setns(pidfd) active reference counting bug.
+ *
+ * These tests are based on the reproducers that triggered the race condition
+ * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly").
+ *
+ * The bug: When using setns() with a pidfd, if the target task exits between
+ * prepare_nsset() and commit_nsset(), the namespaces would become inactive.
+ * Then ns_ref_active_get() would increment from 0 without properly resurrecting
+ * the owner chain, causing active reference count underflows.
+ */
+
+/*
+ * Simple pidfd setns test using create_child()+unshare().
+ *
+ * Without the fix, this would trigger active refcount warnings when the
+ * parent exits after doing setns(pidfd) on a child that has already exited.
+ *
+ * The child unshares UTS, IPC, net and user namespaces, reports readiness
+ * over a socketpair and exits immediately; the parent then calls setns() on
+ * the child's pidfd. The setns() result is only logged, never asserted.
+ */
+TEST(simple_pidfd_setns)
+{
+ pid_t child_pid;
+ int pidfd = -1;
+ int ret;
+ int sv[2];
+ char c;
+
+ /* Ignore SIGCHLD for autoreap */
+ ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+ /* sv[1] is the child's end, sv[0] the parent's */
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+ /* Create a child process without namespaces initially */
+ /* NOTE(review): create_child() comes from ../pidfd/pidfd.h; presumably it fills pidfd in the parent - confirm */
+ child_pid = create_child(&pidfd, 0);
+ ASSERT_GE(child_pid, 0);
+
+ if (child_pid == 0) {
+ close(sv[0]);
+
+ /* Create the namespaces the parent will try to join */
+ if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ /* Signal parent that namespaces are ready */
+ if (write_nointr(sv[1], "1", 1) < 0) {
+ close(sv[1]);
+ _exit(1);
+ }
+
+ /* Exit right away so the parent's setns() can race with our exit */
+ close(sv[1]);
+ _exit(0);
+ }
+ ASSERT_GE(pidfd, 0);
+ EXPECT_EQ(close(sv[1]), 0);
+
+ /* Block until the child reports that its namespaces exist */
+ ret = read_nointr(sv[0], &c, 1);
+ ASSERT_EQ(ret, 1);
+ EXPECT_EQ(close(sv[0]), 0);
+
+ /* Set to child's namespaces via pidfd */
+ ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+ /* Either outcome is accepted here; the result is logged only */
+ TH_LOG("setns() returned %d", ret);
+ close(pidfd);
+}
+
+/*
+ * Simple pidfd setns test where the namespaces exist from clone time.
+ *
+ * Unlike simple_pidfd_setns, the namespace flags are handed straight to
+ * create_child(), so the child is born inside its new namespaces and merely
+ * sleeps while the parent calls setns() on its pidfd. The setns() result is
+ * logged, not asserted.
+ */
+TEST(simple_pidfd_setns_clone)
+{
+	const unsigned int clone_ns_flags =
+		CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET;
+	int pidfd = -1;
+	pid_t child_pid;
+	int setns_ret;
+
+	/* Ignore SIGCHLD for autoreap */
+	ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR);
+
+	/* Create a child process with new namespaces using create_child() */
+	child_pid = create_child(&pidfd, clone_ns_flags);
+	ASSERT_GE(child_pid, 0);
+
+	if (child_pid == 0) {
+		/* Child: sleep for a while so parent can setns to us */
+		sleep(2);
+		_exit(0);
+	}
+
+	/* Parent: pidfd was already created by create_child() */
+	ASSERT_GE(pidfd, 0);
+
+	/* Set to child's namespaces via pidfd */
+	setns_ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC);
+	close(pidfd);
+	TH_LOG("setns() returned %d", setns_ret);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/siocgskns_test.c b/tools/testing/selftests/namespaces/siocgskns_test.c
new file mode 100644
index 000000000000..ba689a22d82f
--- /dev/null
+++ b/tools/testing/selftests/namespaces/siocgskns_test.c
@@ -0,0 +1,1824 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/if.h>
+#include <linux/sockios.h>
+#include <linux/nsfs.h>
+#include <arpa/inet.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+#ifndef SIOCGSKNS
+#define SIOCGSKNS 0x894C
+#endif
+
+#ifndef FD_NSFS_ROOT
+#define FD_NSFS_ROOT -10003
+#endif
+
+#ifndef FILEID_NSFS
+#define FILEID_NSFS 0xf1
+#endif
+
+/*
+ * Test basic SIOCGSKNS functionality.
+ * Create a socket and verify SIOCGSKNS returns the correct network namespace,
+ * identified by comparing inode numbers with /proc/self/ns/net.
+ * Skipped when the ioctl fails with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_basic)
+{
+	int sock_fd, netns_fd, current_netns_fd;
+	struct stat st1, st2;
+
+	/* Create a TCP socket */
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_fd, 0);
+
+	/* Use SIOCGSKNS to get network namespace */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Capture errno before close() gets a chance to change it */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Get current network namespace */
+	current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(current_netns_fd, 0);
+
+	/* Verify they match */
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	close(sock_fd);
+	close(netns_fd);
+	close(current_netns_fd);
+}
+
+/*
+ * Test that socket file descriptors keep network namespaces active.
+ * Create a network namespace, create a socket in it, then exit the namespace.
+ * The namespace should remain active while the socket FD is held.
+ *
+ * A forked child unshares a netns, creates a UDP socket there and passes it
+ * to the parent via SCM_RIGHTS before exiting. Skipped when SIOCGSKNS fails
+ * with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_keeps_netns_active)
+{
+	int sock_fd, netns_fd, test_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	struct stat st;
+
+	/* Everything below uses the socketpair, so a failure here is fatal */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno));
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Create a socket in the new network namespace */
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			TH_LOG("socket() failed: %s", strerror(errno));
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	/* Only accept a SOL_SOCKET/SCM_RIGHTS control message as the fd carrier */
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+	ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Capture errno before close() gets a chance to change it */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+
+	/*
+	 * Namespace should still be active because socket FD keeps it alive.
+	 * Try to access it via /proc/self/fd/<fd>.
+	 */
+	char path[64];
+	snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd);
+	test_fd = open(path, O_RDONLY);
+	ASSERT_GE(test_fd, 0);
+	close(test_fd);
+	close(netns_fd);
+
+	/* Close socket - namespace should become inactive */
+	close(sock_fd);
+
+	/* Try SIOCGSKNS again - should fail since socket is closed */
+	ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0);
+}
+
+/*
+ * Test SIOCGSKNS with different socket types (TCP, UDP, RAW).
+ * All sockets are created in the same namespace, so every namespace fd
+ * returned must have the same inode number. The RAW part is left out when
+ * the raw socket cannot be created; the whole test is skipped when the
+ * ioctl fails with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_socket_types)
+{
+	int sock_tcp, sock_udp, sock_raw;
+	int netns_tcp, netns_udp, netns_raw = -1;
+	struct stat st_tcp, st_udp, st_raw;
+
+	/* TCP socket */
+	sock_tcp = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_tcp, 0);
+
+	/* UDP socket */
+	sock_udp = socket(AF_INET, SOCK_DGRAM, 0);
+	ASSERT_GE(sock_udp, 0);
+
+	/*
+	 * RAW socket (may require privileges). A failure leaves sock_raw
+	 * negative, which disables the raw socket checks below.
+	 */
+	sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+
+	/* Test SIOCGSKNS on TCP */
+	netns_tcp = ioctl(sock_tcp, SIOCGSKNS);
+	if (netns_tcp < 0) {
+		/* Capture errno before the close() calls can change it */
+		int saved_errno = errno;
+
+		close(sock_tcp);
+		close(sock_udp);
+		if (sock_raw >= 0)
+			close(sock_raw);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_tcp, 0);
+	}
+
+	/* Test SIOCGSKNS on UDP */
+	netns_udp = ioctl(sock_udp, SIOCGSKNS);
+	ASSERT_GE(netns_udp, 0);
+
+	/* Test SIOCGSKNS on RAW (if available) */
+	if (sock_raw >= 0) {
+		netns_raw = ioctl(sock_raw, SIOCGSKNS);
+		ASSERT_GE(netns_raw, 0);
+	}
+
+	/* Verify all return the same network namespace */
+	ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0);
+	ASSERT_EQ(fstat(netns_udp, &st_udp), 0);
+	ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino);
+
+	if (sock_raw >= 0) {
+		ASSERT_EQ(fstat(netns_raw, &st_raw), 0);
+		ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino);
+		close(netns_raw);
+		close(sock_raw);
+	}
+
+	close(netns_tcp);
+	close(netns_udp);
+	close(sock_tcp);
+	close(sock_udp);
+}
+
+/*
+ * Test SIOCGSKNS across setns.
+ * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still
+ * returns netns A.
+ *
+ * The original namespace is restored before returning. Skipped when the
+ * ioctl fails with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_across_setns)
+{
+	int sock_fd, netns_a_fd, netns_b_fd, result_fd;
+	struct stat st_a;
+
+	/* Get current netns (A) */
+	netns_a_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(netns_a_fd, 0);
+	ASSERT_EQ(fstat(netns_a_fd, &st_a), 0);
+
+	/* Create socket in netns A */
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sock_fd, 0);
+
+	/* Create new netns (B) */
+	ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+	netns_b_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(netns_b_fd, 0);
+
+	/* Get netns from socket created in A */
+	result_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (result_fd < 0) {
+		/*
+		 * Capture errno first: setns() and close() below may fail or
+		 * otherwise overwrite it before the skip decision is made.
+		 */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		setns(netns_a_fd, CLONE_NEWNET);
+		close(netns_a_fd);
+		close(netns_b_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(result_fd, 0);
+	}
+
+	/* Verify it still points to netns A */
+	struct stat st_result_stat;
+	ASSERT_EQ(fstat(result_fd, &st_result_stat), 0);
+	ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino);
+
+	close(result_fd);
+	close(sock_fd);
+	close(netns_b_fd);
+
+	/* Restore original netns */
+	ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0);
+	close(netns_a_fd);
+}
+
+/*
+ * Test SIOCGSKNS fails on non-socket file descriptors.
+ * Both /dev/null and the read end of a pipe must reject the ioctl with
+ * ENOTTY or EINVAL.
+ */
+TEST(siocgskns_non_socket)
+{
+	int pipe_fds[2];
+	int devnull_fd;
+	int ret;
+
+	/* Test on a non-socket device node */
+	devnull_fd = open("/dev/null", O_RDONLY);
+	ASSERT_GE(devnull_fd, 0);
+
+	ret = ioctl(devnull_fd, SIOCGSKNS);
+	ASSERT_LT(ret, 0);
+	ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+	close(devnull_fd);
+
+	/* Test on pipe */
+	ASSERT_EQ(pipe(pipe_fds), 0);
+
+	ret = ioctl(pipe_fds[0], SIOCGSKNS);
+	ASSERT_LT(ret, 0);
+	ASSERT_TRUE(errno == ENOTTY || errno == EINVAL);
+
+	close(pipe_fds[0]);
+	close(pipe_fds[1]);
+}
+
+/*
+ * Test multiple sockets keep the same network namespace active.
+ * Create multiple sockets, verify closing some doesn't affect others.
+ *
+ * Runs in a freshly unshared network namespace. Skipped when SIOCGSKNS
+ * fails with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_multiple_sockets)
+{
+	/* Total sockets created, and how many of them are closed early */
+	enum { NR_SOCKS = 5, NR_CLOSED_EARLY = 3 };
+	int socks[NR_SOCKS];
+	int netns_fds[NR_SOCKS];
+	int i;
+	struct stat st;
+	ino_t netns_ino;
+
+	/* Create new network namespace */
+	ASSERT_EQ(unshare(CLONE_NEWNET), 0);
+
+	/* Create multiple sockets */
+	for (i = 0; i < NR_SOCKS; i++) {
+		socks[i] = socket(AF_INET, SOCK_STREAM, 0);
+		ASSERT_GE(socks[i], 0);
+	}
+
+	/* Get netns from all sockets */
+	for (i = 0; i < NR_SOCKS; i++) {
+		netns_fds[i] = ioctl(socks[i], SIOCGSKNS);
+		if (netns_fds[i] < 0) {
+			/* Capture errno before the close() calls can change it */
+			int saved_errno = errno;
+			int j;
+
+			/* Every socket exists by now; namespace fds only below i */
+			for (j = 0; j < NR_SOCKS; j++) {
+				close(socks[j]);
+				if (j < i && netns_fds[j] >= 0)
+					close(netns_fds[j]);
+			}
+			if (saved_errno == ENOTTY || saved_errno == EINVAL)
+				SKIP(return, "SIOCGSKNS not supported");
+			ASSERT_GE(netns_fds[i], 0);
+		}
+	}
+
+	/* Verify all point to same netns */
+	ASSERT_EQ(fstat(netns_fds[0], &st), 0);
+	netns_ino = st.st_ino;
+
+	for (i = 1; i < NR_SOCKS; i++) {
+		ASSERT_EQ(fstat(netns_fds[i], &st), 0);
+		ASSERT_EQ(st.st_ino, netns_ino);
+	}
+
+	/* Close some sockets */
+	for (i = 0; i < NR_CLOSED_EARLY; i++)
+		close(socks[i]);
+
+	/* Remaining netns FDs should still be valid */
+	for (i = NR_CLOSED_EARLY; i < NR_SOCKS; i++) {
+		char path[64];
+		snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]);
+		int test_fd = open(path, O_RDONLY);
+		ASSERT_GE(test_fd, 0);
+		close(test_fd);
+	}
+
+	/* Cleanup: the first NR_CLOSED_EARLY sockets are already closed */
+	for (i = 0; i < NR_SOCKS; i++) {
+		if (i >= NR_CLOSED_EARLY)
+			close(socks[i]);
+		close(netns_fds[i]);
+	}
+}
+
+/*
+ * Test socket keeps netns active after creating process exits.
+ * Verify that as long as the socket FD exists, the namespace remains active.
+ *
+ * A forked child unshares a netns, creates a TCP socket there, passes it to
+ * the parent via SCM_RIGHTS and then waits on a pipe until the parent lets
+ * it exit. SIOCGSKNS is queried once while the child is alive and once after
+ * it has exited; both must report the same inode. Skipped when SIOCGSKNS
+ * fails with ENOTTY or EINVAL.
+ */
+TEST(siocgskns_netns_lifecycle)
+{
+	int sock_fd, netns_fd;
+	int ipc_sockets[2];
+	int syncpipe[2];
+	pid_t pid;
+	int status;
+	char sync_byte;
+	struct stat st;
+	ino_t netns_ino;
+
+	/* Everything below uses the socketpair, so a failure here is fatal */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	ASSERT_EQ(pipe(syncpipe), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child */
+		close(ipc_sockets[0]);
+		close(syncpipe[1]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		/* Send socket to parent */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			close(syncpipe[0]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+
+		/* Wait for parent signal; a byte or EOF both mean "exit now" */
+		if (read(syncpipe[0], &sync_byte, 1) < 0) {
+			close(syncpipe[0]);
+			exit(1);
+		}
+		close(syncpipe[0]);
+		exit(0);
+	}
+
+	/* Parent */
+	close(ipc_sockets[1]);
+	close(syncpipe[0]);
+
+	/* Receive socket FD */
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Get netns from socket while child is alive */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/* Capture errno before the cleanup calls can change it */
+		int saved_errno = errno;
+
+		/*
+		 * Best-effort wakeup: even if the write fails, closing the
+		 * write end gives the child EOF so waitpid() cannot hang.
+		 */
+		sync_byte = 'G';
+		if (write(syncpipe[1], &sync_byte, 1) != 1)
+			TH_LOG("failed to signal child: %s", strerror(errno));
+		close(syncpipe[1]);
+		close(sock_fd);
+		waitpid(pid, NULL, 0);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+	netns_ino = st.st_ino;
+
+	/* Signal child to exit */
+	sync_byte = 'G';
+	ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1);
+	close(syncpipe[1]);
+
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * Socket FD should still keep namespace active even after
+	 * the creating process exited.
+	 */
+	int test_fd = ioctl(sock_fd, SIOCGSKNS);
+	ASSERT_GE(test_fd, 0);
+
+	struct stat st_test;
+	ASSERT_EQ(fstat(test_fd, &st_test), 0);
+	ASSERT_EQ(st_test.st_ino, netns_ino);
+
+	close(test_fd);
+	close(netns_fd);
+
+	/* Close socket - namespace should become inactive */
+	close(sock_fd);
+}
+
+/*
+ * Test IPv6 sockets also work with SIOCGSKNS.
+ */
+TEST(siocgskns_ipv6)
+{
+	int sock_fd, netns_fd, current_netns_fd;
+	struct stat st1, st2;
+
+	/* Create an IPv6 TCP socket */
+	sock_fd = socket(AF_INET6, SOCK_STREAM, 0);
+	ASSERT_GE(sock_fd, 0);
+
+	/* Ask the socket for a file descriptor referring to its netns */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		/*
+		 * Latch errno before cleaning up: close() is allowed to
+		 * modify it, which would corrupt the skip decision below.
+		 */
+		int saved_errno = errno;
+
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Verify it matches current namespace */
+	current_netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	ASSERT_GE(current_netns_fd, 0);
+
+	/* Same inode number means both FDs refer to the same namespace */
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	ASSERT_EQ(fstat(current_netns_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+
+	close(sock_fd);
+	close(netns_fd);
+	close(current_netns_fd);
+}
+
+/*
+ * Test that socket-kept netns appears in listns() output.
+ * Verify that a network namespace kept alive by a socket FD appears in
+ * listns() output even after the creating process exits, and that it
+ * disappears when the socket is closed.
+ */
+TEST(siocgskns_listns_visibility)
+{
+	int sock_fd, netns_fd, owner_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	int saved_errno;
+	__u64 netns_id, owner_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int ret, i;
+	bool found_netns = false;
+
+	/*
+	 * Must be fatal: if socketpair() fails, ipc_sockets[] is
+	 * uninitialized and everything below would operate on garbage FDs.
+	 */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only valid if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * In every error path below errno is latched into saved_errno
+	 * before cleanup, because close() may overwrite it.
+	 */
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		saved_errno = errno;
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		saved_errno = errno;
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Get owner user namespace */
+	owner_fd = ioctl(netns_fd, NS_GET_USERNS);
+	if (owner_fd < 0) {
+		saved_errno = errno;
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_USERNS not supported");
+		ASSERT_GE(owner_fd, 0);
+	}
+
+	/* Get owner namespace ID */
+	ret = ioctl(owner_fd, NS_GET_ID, &owner_id);
+	if (ret < 0) {
+		close(owner_fd);
+		close(sock_fd);
+		close(netns_fd);
+		ASSERT_EQ(ret, 0);
+	}
+	close(owner_fd);
+
+	/* Namespace should appear in listns() output */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		saved_errno = errno;
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(saved_errno));
+		ASSERT_GE(ret, 0);
+	}
+
+	/* Search for our network namespace in the list */
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_TRUE(found_netns);
+	TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id);
+
+	/* Now verify with owner filtering */
+	req.user_ns_id = owner_id;
+	found_netns = false;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_TRUE(found_netns);
+	TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id);
+
+	/*
+	 * Drop both the socket and the netns FD - the namespace should
+	 * become inactive and disappear from listns().
+	 */
+	close(sock_fd);
+	close(netns_fd);
+
+	/* Verify it's no longer in listns() output (owner filter removed) */
+	req.user_ns_id = 0;
+	found_netns = false;
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id) {
+			found_netns = true;
+			break;
+		}
+	}
+
+	ASSERT_FALSE(found_netns);
+	TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id);
+}
+
+/*
+ * Test that socket-kept netns can be reopened via file handle.
+ * Verify that a network namespace kept alive by a socket FD can be
+ * reopened using file handles even after the creating process exits.
+ */
+TEST(siocgskns_file_handle)
+{
+	int sock_fd, netns_fd, reopened_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	int saved_errno;
+	struct stat st1, st2;
+	ino_t netns_ino;
+	__u64 netns_id;
+	struct file_handle *handle;
+	struct nsfs_file_handle *nsfs_fh;
+	int ret;
+
+	/* Allocate file_handle structure for nsfs */
+	handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+	ASSERT_NE(handle, NULL);
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+
+	/*
+	 * Must be fatal: if socketpair() fails, ipc_sockets[] is
+	 * uninitialized and everything below would operate on garbage FDs.
+	 */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new netns and socket */
+		close(ipc_sockets[0]);
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only valid if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * In every error path below errno is latched into saved_errno
+	 * before cleanup, because free()/close() may overwrite it.
+	 */
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st1), 0);
+	netns_ino = st1.st_ino;
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Construct file handle from namespace ID */
+	nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+	nsfs_fh->ns_id = netns_id;
+	nsfs_fh->ns_type = 0; /* Type field not needed for reopening */
+	nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */
+
+	TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id);
+
+	/* Reopen namespace using file handle while netns_fd is still open */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	/* Verify it's the same namespace */
+	ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+	TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+	close(reopened_fd);
+
+	/* Close the netns FD; only the socket still refers to the namespace */
+	close(netns_fd);
+
+	/*
+	 * Try to reopen via file handle - should fail since namespace is now
+	 * inactive: the open socket by itself does not make it reopenable.
+	 */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(reopened_fd, 0);
+	TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+	/* Get network namespace from socket again */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* With a netns FD open again, reopening via file handle must work */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	/* Verify it's the same namespace */
+	ASSERT_EQ(fstat(reopened_fd, &st2), 0);
+	ASSERT_EQ(st1.st_ino, st2.st_ino);
+	ASSERT_EQ(st1.st_dev, st2.st_dev);
+
+	TH_LOG("Successfully reopened netns %lu via file handle", netns_ino);
+
+	/* Final cleanup: release every FD still held, then the handle */
+	close(reopened_fd);
+	close(netns_fd);
+	close(sock_fd);
+	free(handle);
+}
+
+/*
+ * Test combined listns() and file handle operations with socket-kept netns.
+ * Create a netns, keep it alive with a socket, verify it appears in listns(),
+ * then reopen it via file handle obtained from listns() entry.
+ */
+TEST(siocgskns_listns_and_file_handle)
+{
+	int sock_fd, netns_fd, userns_fd, reopened_fd;
+	int ipc_sockets[2];
+	pid_t pid;
+	int status;
+	int saved_errno;
+	struct stat st;
+	ino_t netns_ino;
+	__u64 netns_id, userns_id;
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int ret, i;
+	bool found_netns = false, found_userns = false;
+	struct file_handle *handle;
+	struct nsfs_file_handle *nsfs_fh;
+
+	/* Allocate file_handle structure for nsfs */
+	handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+	ASSERT_NE(handle, NULL);
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+
+	/*
+	 * Must be fatal: if socketpair() fails, ipc_sockets[] is
+	 * uninitialized and everything below would operate on garbage FDs.
+	 */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/* Child: create new userns and netns with socket */
+		close(ipc_sockets[0]);
+
+		if (setup_userns() < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		if (unshare(CLONE_NEWNET) < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		sock_fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (sock_fd < 0) {
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		/* Send socket FD to parent via SCM_RIGHTS */
+		struct msghdr msg = {0};
+		struct iovec iov = {0};
+		char buf[1] = {'X'};
+		char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+		iov.iov_base = buf;
+		iov.iov_len = 1;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = cmsg_buf;
+		msg.msg_controllen = sizeof(cmsg_buf);
+
+		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+		memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int));
+
+		if (sendmsg(ipc_sockets[1], &msg, 0) < 0) {
+			close(sock_fd);
+			close(ipc_sockets[1]);
+			exit(1);
+		}
+
+		close(sock_fd);
+		close(ipc_sockets[1]);
+		exit(0);
+	}
+
+	/* Parent: receive socket FD */
+	close(ipc_sockets[1]);
+
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	ssize_t n = recvmsg(ipc_sockets[0], &msg, 0);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(n, 1);
+
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+	ASSERT_NE(cmsg, NULL);
+	memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int));
+
+	/* Wait for child to exit; status is only valid if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid, &status, 0), pid);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * In every error path below errno is latched into saved_errno
+	 * before cleanup, because free()/close() may overwrite it.
+	 */
+
+	/* Get network namespace from socket */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	ASSERT_EQ(fstat(netns_fd, &st), 0);
+	netns_ino = st.st_ino;
+
+	/* Get namespace ID */
+	ret = ioctl(netns_fd, NS_GET_ID, &netns_id);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Get owner user namespace */
+	userns_fd = ioctl(netns_fd, NS_GET_USERNS);
+	if (userns_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_USERNS not supported");
+		ASSERT_GE(userns_fd, 0);
+	}
+
+	/* Get owner namespace ID */
+	ret = ioctl(userns_fd, NS_GET_ID, &userns_id);
+	if (ret < 0) {
+		close(userns_fd);
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		ASSERT_EQ(ret, 0);
+	}
+	close(userns_fd);
+
+	TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id);
+
+	/* Verify namespace appears in listns() */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(saved_errno));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_TRUE(found_netns);
+	ASSERT_TRUE(found_userns);
+	TH_LOG("Found netns %llu in listns() output", netns_id);
+
+	/* Construct file handle from namespace ID */
+	nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+	nsfs_fh->ns_id = netns_id;
+	nsfs_fh->ns_type = 0;
+	nsfs_fh->ns_inum = 0;
+
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	struct stat reopened_st;
+	ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0);
+	ASSERT_EQ(reopened_st.st_ino, netns_ino);
+
+	TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino);
+
+	close(reopened_fd);
+	close(netns_fd);
+
+	/*
+	 * Try to reopen via file handle - should fail since namespace is now
+	 * inactive: the open socket by itself does not make it reopenable.
+	 */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	ASSERT_LT(reopened_fd, 0);
+	TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno));
+
+	/* Get network namespace from socket again */
+	netns_fd = ioctl(sock_fd, SIOCGSKNS);
+	if (netns_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_fd, 0);
+	}
+
+	/* Verify namespace appears in listns() again */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_fd);
+		close(netns_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(saved_errno));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_TRUE(found_netns);
+	ASSERT_TRUE(found_userns);
+	TH_LOG("Found netns %llu in listns() output", netns_id);
+
+	close(netns_fd);
+
+	/* Verify namespace no longer appears in listns() (socket still open) */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		/* netns_fd was already closed above; closing it again would be a double close */
+		close(sock_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		TH_LOG("listns failed: %s", strerror(saved_errno));
+		ASSERT_GE(ret, 0);
+	}
+
+	found_netns = false;
+	found_userns = false;
+	for (i = 0; i < ret; i++) {
+		if (ns_ids[i] == netns_id)
+			found_netns = true;
+		if (ns_ids[i] == userns_id)
+			found_userns = true;
+	}
+	ASSERT_FALSE(found_netns);
+	ASSERT_FALSE(found_userns);
+	TH_LOG("Netns %llu correctly disappeared from listns() after netns FD closed", netns_id);
+
+	close(sock_fd);
+	free(handle);
+}
+
+/*
+ * Send @fd over the AF_LOCAL socket @via_sock as SCM_RIGHTS ancillary data,
+ * accompanied by a single payload byte.
+ *
+ * Returns 0 on success, -1 if sendmsg() fails.
+ */
+static int multilevel_send_fd(int via_sock, int fd)
+{
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1] = {'X'};
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+
+	memset(cmsg_buf, 0, sizeof(cmsg_buf));
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
+
+	return sendmsg(via_sock, &msg, 0) < 0 ? -1 : 0;
+}
+
+/*
+ * Receive one file descriptor sent with multilevel_send_fd() on @via_sock.
+ *
+ * Returns the received FD, or -1 if exactly one payload byte was not
+ * received or the message carries no SCM_RIGHTS control message.
+ */
+static int multilevel_recv_fd(int via_sock)
+{
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	char buf[1];
+	char cmsg_buf[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+	int fd = -1;
+
+	iov.iov_base = buf;
+	iov.iov_len = 1;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsg_buf;
+	msg.msg_controllen = sizeof(cmsg_buf);
+
+	if (recvmsg(via_sock, &msg, 0) != 1)
+		return -1;
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS)
+		return -1;
+
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+	return fd;
+}
+
+/* Return true if @id occurs among the first @count entries of @ids. */
+static bool multilevel_ns_listed(const __u64 *ids, int count, __u64 id)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (ids[i] == id)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Body of a freshly forked child in the namespace chain. Never returns.
+ *
+ * Enters a new user namespace via setup_userns(). If @levels_left is 1 this
+ * is the deepest level: it unshares a network namespace and creates a UDP
+ * socket in it. Otherwise it forks a child for the next level, receives the
+ * socket from it and reaps it. In both cases the socket FD is then sent
+ * over @out_sock. Exits with 0 on success and 1 on any failure; exit()
+ * releases all FDs still open.
+ */
+static void multilevel_child(int out_sock, int levels_left)
+{
+	int ipc[2];
+	int fd;
+	pid_t pid;
+
+	if (setup_userns() < 0)
+		exit(1);
+
+	if (levels_left <= 1) {
+		/* Deepest level: create the network namespace and its socket */
+		if (unshare(CLONE_NEWNET) < 0)
+			exit(1);
+
+		fd = socket(AF_INET, SOCK_DGRAM, 0);
+		if (fd < 0)
+			exit(1);
+	} else {
+		/* Create socketpair for communicating with the next level */
+		if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc) < 0)
+			exit(1);
+
+		pid = fork();
+		if (pid < 0)
+			exit(1);
+
+		if (pid == 0) {
+			close(ipc[0]);
+			/*
+			 * The deeper child only talks to its direct parent;
+			 * drop the upstream socket so an early death of this
+			 * level is seen as EOF by the upstream receiver.
+			 */
+			close(out_sock);
+			multilevel_child(ipc[1], levels_left - 1);
+		}
+
+		close(ipc[1]);
+		fd = multilevel_recv_fd(ipc[0]);
+		close(ipc[0]);
+
+		/* Reap the deeper child whether or not the receive worked */
+		waitpid(pid, NULL, 0);
+		if (fd < 0)
+			exit(1);
+	}
+
+	/* Forward the socket FD one level up */
+	if (multilevel_send_fd(out_sock, fd) < 0)
+		exit(1);
+
+	exit(0);
+}
+
+/*
+ * Test multi-level namespace resurrection across three user namespace levels.
+ *
+ * This test creates a complex namespace hierarchy with three levels of user
+ * namespaces and a network namespace at the deepest level. It verifies that
+ * the resurrection semantics work correctly when SIOCGSKNS is called on a
+ * socket from an inactive namespace tree, and that listns() and
+ * open_by_handle_at() correctly respect visibility rules.
+ *
+ * Hierarchy after child processes exit (all with 0 active refcount):
+ *
+ *   net_L3A (0)      <- Level 3 network namespace
+ *     |
+ *     +
+ *   userns_L3 (0)    <- Level 3 user namespace
+ *     |
+ *     +
+ *   userns_L2 (0)    <- Level 2 user namespace
+ *     |
+ *     +
+ *   userns_L1 (0)    <- Level 1 user namespace
+ *     |
+ *     x
+ *   init_user_ns
+ *
+ * The test verifies:
+ * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain
+ * 2. After resurrection, all namespaces are visible in listns()
+ * 3. Resurrected namespaces can be reopened via file handles
+ * 4. Closing the netns FD cascades down: the entire ownership chain
+ *    (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again
+ * 5. Inactive namespaces disappear from listns() and cannot be reopened
+ * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again
+ * 7. After second resurrection, namespaces are visible and can be reopened
+ */
+TEST(siocgskns_multilevel_resurrection)
+{
+	int ipc_sockets[2];
+	pid_t pid_l1;
+	int status;
+	int saved_errno;
+
+	/* Socket received from the child chain and the netns FD derived from it */
+	int sock_L3A_fd = -1;
+	int netns_L3A_fd = -1;
+	int userns_L1_fd, userns_L2_fd, userns_L3_fd;
+	__u64 netns_L3A_id;
+	__u64 userns_L1_id, userns_L2_id, userns_L3_id;
+
+	/* For listns() and file handle testing */
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWNET | CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids[256];
+	int ret;
+	struct file_handle *handle;
+	struct nsfs_file_handle *nsfs_fh;
+	int reopened_fd;
+
+	/* Allocate file handle for testing */
+	handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle));
+	ASSERT_NE(handle, NULL);
+	handle->handle_bytes = sizeof(struct nsfs_file_handle);
+	handle->handle_type = FILEID_NSFS;
+
+	/*
+	 * Must be fatal: if socketpair() fails, ipc_sockets[] is
+	 * uninitialized and everything below would operate on garbage FDs.
+	 */
+	ASSERT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0);
+
+	/*
+	 * Fork the level 1 child; it creates userns_L1 and recursively
+	 * spawns levels 2 and 3 (three user namespace levels in total).
+	 */
+	pid_l1 = fork();
+	ASSERT_GE(pid_l1, 0);
+
+	if (pid_l1 == 0) {
+		close(ipc_sockets[0]);
+		multilevel_child(ipc_sockets[1], 3);
+	}
+
+	/* Parent - receive the socket from the deepest level */
+	close(ipc_sockets[1]);
+
+	sock_L3A_fd = multilevel_recv_fd(ipc_sockets[0]);
+	close(ipc_sockets[0]);
+
+	if (sock_L3A_fd < 0) {
+		free(handle);
+		waitpid(pid_l1, NULL, 0);
+		SKIP(return, "Failed to receive socket from child");
+	}
+
+	/* Wait for L1 child; status is only valid if waitpid() succeeded */
+	ASSERT_EQ(waitpid(pid_l1, &status, 0), pid_l1);
+	ASSERT_TRUE(WIFEXITED(status));
+	ASSERT_EQ(WEXITSTATUS(status), 0);
+
+	/*
+	 * At this point, all child processes have exited. The socket itself
+	 * doesn't keep the namespace active - we need to call SIOCGSKNS which
+	 * will resurrect the entire namespace tree by taking active references.
+	 *
+	 * In every error path below errno is latched into saved_errno before
+	 * cleanup, because free()/close() may overwrite it.
+	 */
+
+	/* Get network namespace from socket - this resurrects the tree */
+	netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+	if (netns_L3A_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "SIOCGSKNS not supported");
+		ASSERT_GE(netns_L3A_fd, 0);
+	}
+
+	/* Get namespace ID for net_L3A */
+	ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		close(netns_L3A_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_ID not supported");
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */
+	userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS);
+	if (userns_L3_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		close(netns_L3A_fd);
+		if (saved_errno == ENOTTY || saved_errno == EINVAL)
+			SKIP(return, "NS_GET_USERNS not supported");
+		ASSERT_GE(userns_L3_fd, 0);
+	}
+
+	ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id);
+	ASSERT_EQ(ret, 0);
+
+	userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS);
+	ASSERT_GE(userns_L2_fd, 0);
+	ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id);
+	ASSERT_EQ(ret, 0);
+
+	userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS);
+	ASSERT_GE(userns_L1_fd, 0);
+	ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id);
+	ASSERT_EQ(ret, 0);
+
+	close(userns_L1_fd);
+	close(userns_L2_fd);
+	close(userns_L3_fd);
+
+	TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)",
+	       netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id);
+
+	/*
+	 * Test 1: Verify net_L3A is visible in listns() after resurrection.
+	 * The entire ownership chain should be resurrected and visible.
+	 */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	if (ret < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		close(netns_L3A_fd);
+		if (saved_errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret, 0);
+	}
+
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, netns_L3A_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L1_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L2_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L3_id));
+	TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()");
+
+	/*
+	 * Test 2: Verify net_L3A can be reopened via file handle.
+	 */
+	nsfs_fh = (struct nsfs_file_handle *)handle->f_handle;
+	nsfs_fh->ns_id = netns_L3A_id;
+	nsfs_fh->ns_type = 0;
+	nsfs_fh->ns_inum = 0;
+
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		close(netns_L3A_fd);
+		if (saved_errno == EOPNOTSUPP || saved_errno == ENOSYS || saved_errno == EBADF)
+			SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported");
+		TH_LOG("open_by_handle_at failed: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	close(reopened_fd);
+	TH_LOG("File handle test passed: net_L3A can be reopened");
+
+	/*
+	 * Test 3: Verify that when we close the netns FD (dropping the last
+	 * active reference), the entire tree becomes inactive and disappears
+	 * from listns(). The cascade goes: net_L3A drops -> userns_L3 drops ->
+	 * userns_L2 drops -> userns_L1 drops.
+	 */
+	close(netns_L3A_fd);
+
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	ASSERT_FALSE(multilevel_ns_listed(ns_ids, ret, netns_L3A_id));
+	ASSERT_FALSE(multilevel_ns_listed(ns_ids, ret, userns_L1_id));
+	ASSERT_FALSE(multilevel_ns_listed(ns_ids, ret, userns_L2_id));
+	ASSERT_FALSE(multilevel_ns_listed(ns_ids, ret, userns_L3_id));
+	TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed");
+
+	/*
+	 * Test 4: Verify file handle no longer works for inactive namespace.
+	 */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd >= 0) {
+		/* Unexpected success: release everything before failing */
+		close(reopened_fd);
+		close(sock_L3A_fd);
+		free(handle);
+	}
+	ASSERT_LT(reopened_fd, 0);
+	TH_LOG("Inactive namespace correctly cannot be reopened via file handle");
+
+	/*
+	 * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again.
+	 * The socket is still valid, so we can call SIOCGSKNS on it to resurrect
+	 * the namespace tree once more.
+	 */
+	netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS);
+	ASSERT_GE(netns_L3A_fd, 0);
+
+	TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree");
+
+	/* Verify the namespace tree is resurrected and visible in listns() */
+	ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0);
+	ASSERT_GE(ret, 0);
+
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, netns_L3A_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L1_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L2_id));
+	ASSERT_TRUE(multilevel_ns_listed(ns_ids, ret, userns_L3_id));
+	TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again");
+
+	/* Verify we can reopen via file handle again */
+	reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY);
+	if (reopened_fd < 0) {
+		saved_errno = errno;
+		free(handle);
+		close(sock_L3A_fd);
+		close(netns_L3A_fd);
+		TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(saved_errno));
+		ASSERT_GE(reopened_fd, 0);
+	}
+
+	close(reopened_fd);
+	TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection");
+
+	/* Final cleanup */
+	close(sock_L3A_fd);
+	close(netns_L3A_fd);
+	free(handle);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/stress_test.c b/tools/testing/selftests/namespaces/stress_test.c
new file mode 100644
index 000000000000..dd7df7d6cb27
--- /dev/null
+++ b/tools/testing/selftests/namespaces/stress_test.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/nsfs.h>
+#include "../kselftest_harness.h"
+#include "../filesystems/utils.h"
+#include "wrappers.h"
+
+/*
+ * Stress tests for namespace active reference counting.
+ *
+ * These tests validate that the active reference counting system can handle
+ * high load scenarios including rapid namespace creation/destruction, large
+ * numbers of concurrent namespaces, and various edge cases under stress.
+ */
+
+/*
+ * Test rapid creation and destruction of user namespaces.
+ *
+ * Forks 100 short-lived children one after another. Each child creates a
+ * user namespace with setup_userns() and exits immediately. Afterwards
+ * listns() must report the same number of user namespaces as before the
+ * loop, i.e. nothing was leaked.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(rapid_namespace_creation_destruction)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[256], ns_ids_after[256];
+	ssize_t ret_before, ret_after;
+	int i;
+
+	/* Get baseline count of active user namespaces */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	/* Rapidly create and destroy 100 user namespaces */
+	for (i = 0; i < 100; i++) {
+		int status = 0;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		if (pid == 0) {
+			/* Child: create user namespace and immediately exit */
+			if (setup_userns() < 0)
+				exit(1);
+			exit(0);
+		}
+
+		/*
+		 * Parent: reap the child. The status word is only valid
+		 * when waitpid() actually returned this child.
+		 */
+		ASSERT_EQ(waitpid(pid, &status, 0), pid);
+		ASSERT_TRUE(WIFEXITED(status));
+		ASSERT_EQ(WEXITSTATUS(status), 0);
+	}
+
+	/* Verify we're back to baseline (no leaked namespaces) */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test creating many concurrent namespaces.
+ *
+ * Forks one child per slot in pids[]. Every child creates its own user
+ * namespace, reports readiness with one byte over a shared socketpair and
+ * then blocks until the parent sends one byte back. While all children are
+ * parked, listns() must show at least that many additional user namespaces.
+ * After they have exited, the count must be back at the baseline.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(many_concurrent_namespaces)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512];
+	ssize_t ret_before, ret_during, ret_after;
+	pid_t pids[50];
+	/* Derived from the array so the two can never disagree */
+	int num_children = ARRAY_SIZE(pids);
+	int i;
+	int sv[2];
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create many children, each with their own user namespace */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Child: create user namespace and wait for parent signal */
+			char c = 'R'; /* defined value: this byte is sent to the parent */
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+
+		if (read(sv[0], &c, 1) != 1) {
+			/* If we fail to read, kill all children and exit */
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* List namespaces while all children are running */
+	ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0);
+	ASSERT_GE(ret_during, 0);
+
+	TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during);
+
+	/* Should have at least num_children more namespaces than baseline */
+	ASSERT_GE(ret_during, ret_before + num_children);
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			/* If we fail to write, kill remaining children */
+			close(sv[0]);
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Wait for all children; status is only valid if waitpid() succeeded */
+	for (i = 0; i < num_children; i++) {
+		int status = 0;
+
+		ASSERT_EQ(waitpid(pids[i], &status, 0), pids[i]);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After all children exit: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test rapid namespace creation with different namespace types.
+ *
+ * Forks 50 short-lived children one after another. Each child creates a
+ * user namespace and then unshares a network, UTS and IPC namespace before
+ * exiting. The number of namespaces of all types reported by listns() must
+ * be the same before and after.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(rapid_mixed_namespace_creation)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0, /* All types */
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int i;
+
+	/* Get baseline count */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces (all types)", ret_before);
+
+	/* Rapidly create and destroy namespaces with multiple types */
+	for (i = 0; i < 50; i++) {
+		int status = 0;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		if (pid == 0) {
+			/* Child: create multiple namespace types */
+			if (setup_userns() < 0)
+				exit(1);
+
+			/* Create additional namespace types */
+			if (unshare(CLONE_NEWNET) < 0)
+				exit(1);
+			if (unshare(CLONE_NEWUTS) < 0)
+				exit(1);
+			if (unshare(CLONE_NEWIPC) < 0)
+				exit(1);
+
+			exit(0);
+		}
+
+		/*
+		 * Parent: reap the child. The exit code is deliberately not
+		 * checked here (as before); only a normal exit is required.
+		 */
+		ASSERT_EQ(waitpid(pid, &status, 0), pid);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test nested namespace creation under stress.
+ *
+ * Forks 20 children one after another. Each child enters five user
+ * namespaces in sequence, each obtained from get_userns_fd() and joined
+ * with setns(), and then exits. The number of user namespaces reported by
+ * listns() must be the same before and after.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(nested_namespace_stress)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int i;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active user namespaces", ret_before);
+
+	/* Create 20 processes, each with nested user namespaces */
+	for (i = 0; i < 20; i++) {
+		int status = 0;
+		pid_t pid = fork();
+
+		ASSERT_GE(pid, 0);
+
+		if (pid == 0) {
+			int userns_fd;
+			uid_t orig_uid = getuid();
+			int depth;
+
+			/* Create nested user namespaces (up to 5 levels) */
+			for (depth = 0; depth < 5; depth++) {
+				/* Only the outermost level uses the original uid; deeper levels pass 0 */
+				userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1);
+				if (userns_fd < 0)
+					exit(1);
+
+				if (setns(userns_fd, CLONE_NEWUSER) < 0) {
+					close(userns_fd);
+					exit(1);
+				}
+				close(userns_fd);
+			}
+
+			exit(0);
+		}
+
+		/* Parent: reap the child; status is only valid if waitpid() succeeded */
+		ASSERT_EQ(waitpid(pid, &status, 0), pid);
+		ASSERT_TRUE(WIFEXITED(status));
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test listns() pagination under stress.
+ *
+ * Parks one child per slot in pids[], each inside its own user namespace,
+ * then walks the user namespace list in batches of five, using the last id
+ * of a full batch as the cursor for the next call. The collected ids must
+ * not contain duplicates.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(listns_pagination_stress)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	pid_t pids[30];
+	/* Derived from the array so the two can never disagree */
+	int num_children = ARRAY_SIZE(pids);
+	int i;
+	int sv[2];
+	__u64 all_ns_ids[512];
+	int total_found = 0;
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0);
+
+	/* Create many children with user namespaces */
+	for (i = 0; i < num_children; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			char c = 'R'; /* defined value: this byte is sent to the parent */
+
+			close(sv[0]);
+
+			if (setup_userns() < 0) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Signal parent we're ready */
+			if (write(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			/* Wait for parent signal to exit */
+			if (read(sv[1], &c, 1) != 1) {
+				close(sv[1]);
+				exit(1);
+			}
+
+			close(sv[1]);
+			exit(0);
+		}
+	}
+
+	close(sv[1]);
+
+	/* Wait for all children to signal ready */
+	for (i = 0; i < num_children; i++) {
+		char c;
+
+		if (read(sv[0], &c, 1) != 1) {
+			/* If we fail to read, kill all children and exit */
+			close(sv[0]);
+			for (int j = 0; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	/* Paginate through all namespaces using small batch sizes */
+	req.ns_id = 0;
+	while (1) {
+		__u64 batch[5]; /* Small batch size to force pagination */
+		ssize_t ret;
+
+		ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0);
+		if (ret < 0) {
+			/* The cleanup calls below may overwrite errno */
+			int saved_errno = errno;
+
+			/* Reap the children on every failure, not only on ENOSYS */
+			close(sv[0]);
+			for (i = 0; i < num_children; i++)
+				kill(pids[i], SIGKILL);
+			for (i = 0; i < num_children; i++)
+				waitpid(pids[i], NULL, 0);
+
+			if (saved_errno == ENOSYS)
+				SKIP(return, "listns() not supported");
+			ASSERT_GE(ret, 0);
+		}
+
+		if (ret == 0)
+			break;
+
+		/* Store results, bounded by the capacity of all_ns_ids[] */
+		for (i = 0; i < ret && total_found < (int)ARRAY_SIZE(all_ns_ids); i++)
+			all_ns_ids[total_found++] = batch[i];
+
+		/* A short batch means the listing is exhausted */
+		if (ret != ARRAY_SIZE(batch))
+			break;
+
+		/* Update cursor for next batch */
+		req.ns_id = batch[ret - 1];
+	}
+
+	TH_LOG("Paginated through %d user namespaces", total_found);
+
+	/* Verify no duplicates in pagination */
+	for (i = 0; i < total_found; i++) {
+		for (int j = i + 1; j < total_found; j++) {
+			if (all_ns_ids[i] == all_ns_ids[j]) {
+				TH_LOG("Found duplicate ns_id: %llu at positions %d and %d",
+				       (unsigned long long)all_ns_ids[i], i, j);
+				ASSERT_TRUE(false);
+			}
+		}
+	}
+
+	/* Signal all children to exit */
+	for (i = 0; i < num_children; i++) {
+		char c = 'X';
+
+		if (write(sv[0], &c, 1) != 1) {
+			close(sv[0]);
+			for (int j = i; j < num_children; j++)
+				kill(pids[j], SIGKILL);
+			for (int j = 0; j < num_children; j++)
+				waitpid(pids[j], NULL, 0);
+			ASSERT_TRUE(false);
+		}
+	}
+
+	close(sv[0]);
+
+	/* Wait for all children; their exit status is not inspected */
+	for (i = 0; i < num_children; i++)
+		waitpid(pids[i], NULL, 0);
+}
+
+/*
+ * Test concurrent namespace operations.
+ *
+ * Starts one worker per slot in pids[]. Each worker runs ten iterations of:
+ * obtain a user namespace fd from get_userns_fd(), call listns(), close the
+ * fd, sleep 1ms. Once every worker has exited successfully, the number of
+ * namespaces of all types must equal the baseline.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(concurrent_namespace_operations)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = 0,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	pid_t pids[20];
+	/* Derived from the array so the two can never disagree */
+	int num_workers = ARRAY_SIZE(pids);
+	int i;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+	/* Create worker processes that do concurrent operations */
+	for (i = 0; i < num_workers; i++) {
+		pids[i] = fork();
+		ASSERT_GE(pids[i], 0);
+
+		if (pids[i] == 0) {
+			/* Each worker: create namespaces, list them, repeat */
+			int iterations;
+
+			for (iterations = 0; iterations < 10; iterations++) {
+				int userns_fd;
+				__u64 temp_ns_ids[100];
+				ssize_t ret;
+
+				/* Create a user namespace; a failed attempt is simply skipped */
+				userns_fd = get_userns_fd(0, getuid(), 1);
+				if (userns_fd < 0)
+					continue;
+
+				/* List namespaces; the result is intentionally ignored */
+				ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0);
+				(void)ret;
+
+				close(userns_fd);
+
+				/* Small delay */
+				usleep(1000);
+			}
+
+			exit(0);
+		}
+	}
+
+	/* Wait for all workers; status is only valid if waitpid() succeeded */
+	for (i = 0; i < num_workers; i++) {
+		int status = 0;
+
+		ASSERT_EQ(waitpid(pids[i], &status, 0), pids[i]);
+		ASSERT_TRUE(WIFEXITED(status));
+		ASSERT_EQ(WEXITSTATUS(status), 0);
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After concurrent operations: %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+/*
+ * Test namespace churn - continuous creation and destruction.
+ *
+ * Runs ten cycles. Each cycle forks a batch of children that create a user,
+ * network and UTS namespace, stay alive for 10ms and exit; the batch is
+ * reaped before the next cycle starts. Afterwards the number of user, net
+ * and UTS namespaces reported by listns() must equal the baseline.
+ *
+ * Skipped when listns() is not implemented (ENOSYS).
+ */
+TEST(namespace_churn)
+{
+	struct ns_id_req req = {
+		.size = sizeof(req),
+		.spare = 0,
+		.ns_id = 0,
+		.ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS,
+		.spare2 = 0,
+		.user_ns_id = 0,
+	};
+	__u64 ns_ids_before[512], ns_ids_after[512];
+	ssize_t ret_before, ret_after;
+	int cycle;
+
+	/* Get baseline */
+	ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0);
+	if (ret_before < 0) {
+		if (errno == ENOSYS)
+			SKIP(return, "listns() not supported");
+		ASSERT_GE(ret_before, 0);
+	}
+
+	TH_LOG("Baseline: %zd active namespaces", ret_before);
+
+	/* Simulate churn: batches of namespaces created and destroyed */
+	for (cycle = 0; cycle < 10; cycle++) {
+		pid_t batch_pids[10];
+		const int batch_size = ARRAY_SIZE(batch_pids);
+		int i;
+
+		/* Create batch */
+		for (i = 0; i < batch_size; i++) {
+			batch_pids[i] = fork();
+			ASSERT_GE(batch_pids[i], 0);
+
+			if (batch_pids[i] == 0) {
+				/* Create multiple namespace types */
+				if (setup_userns() < 0)
+					exit(1);
+				if (unshare(CLONE_NEWNET) < 0)
+					exit(1);
+				if (unshare(CLONE_NEWUTS) < 0)
+					exit(1);
+
+				/* Keep namespaces alive briefly */
+				usleep(10000);
+				exit(0);
+			}
+		}
+
+		/*
+		 * Wait for batch to complete. Every child must really have
+		 * been reaped, otherwise it could still pin its namespaces
+		 * when the baseline is compared below. The exit status itself
+		 * is not inspected.
+		 */
+		for (i = 0; i < batch_size; i++)
+			ASSERT_EQ(waitpid(batch_pids[i], NULL, 0), batch_pids[i]);
+	}
+
+	/* Verify we're back to baseline */
+	ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0);
+	ASSERT_GE(ret_after, 0);
+
+	TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after);
+	ASSERT_EQ(ret_before, ret_after);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/namespaces/wrappers.h b/tools/testing/selftests/namespaces/wrappers.h
new file mode 100644
index 000000000000..9741a64a5b1d
--- /dev/null
+++ b/tools/testing/selftests/namespaces/wrappers.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/nsfs.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__
+#define __SELFTESTS_NAMESPACES_WRAPPERS_H__
+
+#ifndef __NR_listns /* fallback: only used when no included header defined it already */
+ #if defined __alpha__
+ #define __NR_listns 580
+ #elif defined _MIPS_SIM /* MIPS: the number depends on the ABI selected by _MIPS_SIM */
+ #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */
+ #define __NR_listns 4470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */
+ #define __NR_listns 6470
+ #endif
+ #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */
+ #define __NR_listns 5470
+ #endif
+ #else /* every target that is neither alpha nor MIPS */
+ #define __NR_listns 470
+ #endif
+#endif
+
+/*
+ * Invoke the listns system call directly through syscall(2), forwarding all
+ * four arguments unchanged. Returns whatever syscall() returned, narrowed
+ * to int.
+ */
+static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
+			     size_t nr_ns_ids, unsigned int flags)
+{
+	long rc = syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);
+
+	return (int)rc;
+}
+
+#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 439101b518ee..8f9850a71f54 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -45,6 +45,7 @@ skf_net_off
socket
so_incoming_cpu
so_netns_cookie
+so_peek_off
so_txtime
so_rcv_listener
stress_reuseport_listen
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index de805cbbdf69..528d14c598bb 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -6,6 +6,7 @@ TEST_GEN_PROGS := \
scm_inq \
scm_pidfd \
scm_rights \
+ so_peek_off \
unix_connect \
# end of TEST_GEN_PROGS
diff --git a/tools/testing/selftests/net/af_unix/so_peek_off.c b/tools/testing/selftests/net/af_unix/so_peek_off.c
new file mode 100644
index 000000000000..1a77728128e5
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/so_peek_off.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2025 Google LLC */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(so_peek_off) /* per-test state: the AF_UNIX pair created by socketpair() in FIXTURE_SETUP */
+{
+ int fd[2]; /* 0: sender, 1: receiver */
+};
+
+FIXTURE_VARIANT(so_peek_off)
+{
+ int type; /* socket type handed to socketpair() in FIXTURE_SETUP */
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, stream)
+{
+ .type = SOCK_STREAM,
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, dgram)
+{
+ .type = SOCK_DGRAM,
+};
+
+FIXTURE_VARIANT_ADD(so_peek_off, seqpacket)
+{
+ .type = SOCK_SEQPACKET,
+};
+
+/*
+ * Create the socket pair for the current variant, give the receiving end a
+ * 3ms receive timeout and enable SO_PEEK_OFF on it with an initial offset
+ * of 0.
+ */
+FIXTURE_SETUP(so_peek_off)
+{
+	struct timeval timeout = {
+		.tv_sec = 0,
+		.tv_usec = 3000,
+	};
+	int ret;
+
+	ret = socketpair(AF_UNIX, variant->type, 0, self->fd);
+	ASSERT_EQ(0, ret);
+
+	/*
+	 * Use SO_RCVTIMEO rather than SO_RCVTIMEO_NEW: the latter always
+	 * expects two 64-bit fields, which does not match struct timeval on
+	 * ABIs with a 32-bit time_t. SO_RCVTIMEO resolves to the variant
+	 * that fits the struct timeval this file is compiled with.
+	 */
+	ret = setsockopt(self->fd[1], SOL_SOCKET, SO_RCVTIMEO,
+			 &timeout, sizeof(timeout));
+	ASSERT_EQ(0, ret);
+
+	ret = setsockopt(self->fd[1], SOL_SOCKET, SO_PEEK_OFF,
+			 &(int){0}, sizeof(int));
+	ASSERT_EQ(0, ret);
+}
+
+/*
+ * Close both ends of the socket pair.
+ *
+ * Each descriptor is closed on its own: close_range(fd[0], fd[1]) would
+ * only be correct if fd[0] <= fd[1] and no unrelated descriptor happened to
+ * sit between the two.
+ */
+FIXTURE_TEARDOWN(so_peek_off)
+{
+	close(self->fd[0]);
+	close(self->fd[1]);
+}
+
+/* Send @str (without its NUL) on @fd and assert that send() took all of it. */
+#define sendeq(fd, str, flags)						\
+	do {								\
+		int len = strlen(str);					\
+		int bytes = send(fd, str, len, flags);			\
+									\
+		ASSERT_EQ(len, bytes);					\
+	} while (0)
+
+/*
+ * Receive at most @buflen bytes from @fd into a zeroed buffer, then assert
+ * that recv() did not return -1 and that the buffer, read as a C string,
+ * equals @str.
+ */
+#define recveq(fd, str, buflen, flags)					\
+	do {								\
+		char buf[(buflen) + 1] = {};				\
+		int bytes = recv(fd, buf, buflen, flags);		\
+									\
+		ASSERT_NE(-1, bytes);					\
+		ASSERT_STREQ(str, buf);					\
+	} while (0)
+
+#define async /* usage: async { ... } - the braced body runs in a forked child */ \
+ for (pid_t pid = (pid = fork(), /* init clause: fork() is called exactly once */ \
+ pid < 0 ? /* fork() failed: log, mark the test as failed and bail out */ \
+ __TH_LOG("Failed to start async {}"), \
+ _metadata->exit_code = KSFT_FAIL, \
+ __bail(1, _metadata), \
+ 0xdead : /* non-zero value, so the body is skipped should __bail() return */ \
+ pid); /* fork() succeeded: keep its return value */ \
+ !pid; exit(0)) /* body runs only where pid == 0 (the child), which then exits with 0 */
+
+/* One 8-byte send: two consecutive peeks return its first and second half. */
+TEST_F(so_peek_off, single_chunk)
+{
+	const int tx = self->fd[0];
+	const int rx = self->fd[1];
+
+	sendeq(tx, "aaaabbbb", 0);
+
+	recveq(rx, "aaaa", 4, MSG_PEEK);
+	recveq(rx, "bbbb", 100, MSG_PEEK);
+}
+
+/* Two 4-byte sends queued up front: each peek returns one of them in order. */
+TEST_F(so_peek_off, two_chunks)
+{
+	const int tx = self->fd[0];
+	const int rx = self->fd[1];
+
+	sendeq(tx, "aaaa", 0);
+	sendeq(tx, "bbbb", 0);
+
+	recveq(rx, "aaaa", 4, MSG_PEEK);
+	recveq(rx, "bbbb", 100, MSG_PEEK);
+}
+
+/*
+ * Same as two_chunks, but each chunk is sent by a forked child after a 1ms
+ * delay, so the matching peek is issued before the data has been sent.
+ */
+TEST_F(so_peek_off, two_chunks_blocking)
+{
+	const int tx = self->fd[0];
+	const int rx = self->fd[1];
+
+	async {
+		usleep(1000);
+		sendeq(tx, "aaaa", 0);
+	}
+
+	recveq(rx, "aaaa", 4, MSG_PEEK);
+
+	async {
+		usleep(1000);
+		sendeq(tx, "bbbb", 0);
+	}
+
+	/* goto again; -> goto redo; in unix_stream_read_generic(). */
+	recveq(rx, "bbbb", 100, MSG_PEEK);
+}
+
+/*
+ * Peek only part of the first chunk, queue a second chunk, then peek again:
+ * the expected split of the remaining data depends on the socket type.
+ */
+TEST_F(so_peek_off, two_chunks_overlap)
+{
+	const int tx = self->fd[0];
+	const int rx = self->fd[1];
+
+	sendeq(tx, "aaaa", 0);
+	recveq(rx, "aa", 2, MSG_PEEK);
+
+	sendeq(tx, "bbbb", 0);
+
+	if (variant->type != SOCK_STREAM) {
+		/* SOCK_DGRAM and SOCK_SEQPACKET return at the skb boundary. */
+		recveq(rx, "aa", 100, MSG_PEEK);
+		recveq(rx, "bbbb", 100, MSG_PEEK);
+	} else {
+		/* SOCK_STREAM tries to fill the buffer. */
+		recveq(rx, "aabb", 4, MSG_PEEK);
+		recveq(rx, "bb", 100, MSG_PEEK);
+	}
+}
+
+/*
+ * Overlap case with delayed senders: both chunks are sent by forked
+ * children after 1ms, and the first chunk is only partially peeked before
+ * the second sender is started.
+ */
+TEST_F(so_peek_off, two_chunks_overlap_blocking)
+{
+	const int tx = self->fd[0];
+	const int rx = self->fd[1];
+
+	async {
+		usleep(1000);
+		sendeq(tx, "aaaa", 0);
+	}
+
+	recveq(rx, "aa", 2, MSG_PEEK);
+
+	async {
+		usleep(1000);
+		sendeq(tx, "bbbb", 0);
+	}
+
+	/* Even SOCK_STREAM does not wait if at least one byte is read. */
+	recveq(rx, "aa", 100, MSG_PEEK);
+
+	recveq(rx, "bbbb", 100, MSG_PEEK);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/forwarding/lib_sh_test.sh b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
index ff2accccaf4d..b4eda6c6199e 100755
--- a/tools/testing/selftests/net/forwarding/lib_sh_test.sh
+++ b/tools/testing/selftests/net/forwarding/lib_sh_test.sh
@@ -30,6 +30,11 @@ tfail()
do_test "tfail" false
}
+tfail2()
+{
+ do_test "tfail2" false # runs "false" like tfail, but under a distinct test name
+}
+
txfail()
{
FAIL_TO_XFAIL=yes do_test "txfail" false
@@ -132,6 +137,8 @@ test_ret()
ret_subtest $ksft_fail "tfail" txfail tfail
ret_subtest $ksft_xfail "txfail" txfail txfail
+
+ ret_subtest $ksft_fail "tfail2" tfail2 tfail
}
exit_status_tests_run()
diff --git a/tools/testing/selftests/net/forwarding/local_termination.sh b/tools/testing/selftests/net/forwarding/local_termination.sh
index ecd34f364125..892895659c7e 100755
--- a/tools/testing/selftests/net/forwarding/local_termination.sh
+++ b/tools/testing/selftests/net/forwarding/local_termination.sh
@@ -176,6 +176,8 @@ run_test()
local rcv_dmac=$(mac_get $rcv_if_name)
local should_receive
+ setup_wait
+
tcpdump_start $rcv_if_name
mc_route_prepare $send_if_name
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index feba4ef69a54..f448bafb3f20 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -43,7 +43,7 @@ __ksft_status_merge()
weights[$i]=$((weight++))
done
- if [[ ${weights[$a]} > ${weights[$b]} ]]; then
+ if [[ ${weights[$a]} -ge ${weights[$b]} ]]; then
echo "$a"
return 0
else
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index b148cadb96d0..fc7e22b503d3 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -710,8 +710,14 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
bw = do_rnd_write(peerfd, winfo->buf + winfo->off, winfo->len);
if (bw < 0) {
- if (cfg_rcv_trunc)
- return 0;
+ /* expected reset, continue to read */
+ if (cfg_rcv_trunc &&
+ (errno == ECONNRESET ||
+ errno == EPIPE)) {
+ fds.events &= ~POLLOUT;
+ continue;
+ }
+
perror("write");
return 111;
}
@@ -737,8 +743,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd,
}
if (fds.revents & (POLLERR | POLLNVAL)) {
- if (cfg_rcv_trunc)
- return 0;
+ if (cfg_rcv_trunc) {
+ fds.events &= ~(POLLERR | POLLNVAL);
+ continue;
+ }
fprintf(stderr, "Unexpected revents: "
"POLLERR/POLLNVAL(%x)\n", fds.revents);
return 5;
@@ -1433,7 +1441,7 @@ static void parse_opts(int argc, char **argv)
*/
if (cfg_truncate < 0) {
cfg_rcv_trunc = true;
- signal(SIGPIPE, handle_signal);
+ signal(SIGPIPE, SIG_IGN);
}
break;
case 'j':
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index 47ecb5b3836e..9b7b93f8eb0c 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -492,7 +492,7 @@ do_transfer()
"than expected (${expect_synrx})"
retc=1
fi
- if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ] && [ ${stat_ooo_now} -eq 0 ]; then
+ if [ ${stat_ackrx_now_l} -lt ${expect_ackrx} ]; then
if [ ${stat_ooo_now} -eq 0 ]; then
mptcp_lib_pr_fail "lower MPC ACK rx (${stat_ackrx_now_l})" \
"than expected (${expect_ackrx})"
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 78a1aa4ecff2..43f31f8d587f 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -2532,7 +2532,7 @@ remove_tests()
if reset "remove single subflow"; then
pm_nl_set_limits $ns1 0 1
pm_nl_set_limits $ns2 0 1
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns2=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
@@ -2545,8 +2545,8 @@ remove_tests()
if reset "remove multiple subflows"; then
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
- pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns2=-2 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
@@ -2557,7 +2557,7 @@ remove_tests()
# single address, remove
if reset "remove single address"; then
pm_nl_set_limits $ns1 0 1
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 1
addr_nr_ns1=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -2570,9 +2570,9 @@ remove_tests()
# subflow and signal, remove
if reset "remove subflow and signal"; then
pm_nl_set_limits $ns1 0 2
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 2
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
addr_nr_ns1=-1 addr_nr_ns2=-1 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
@@ -2584,10 +2584,10 @@ remove_tests()
# subflows and signal, remove
if reset "remove subflows and signal"; then
pm_nl_set_limits $ns1 0 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 3
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-1 addr_nr_ns2=-2 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2599,9 +2599,9 @@ remove_tests()
# addresses remove
if reset "remove addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-3 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
@@ -2614,10 +2614,10 @@ remove_tests()
# invalid addresses remove
if reset "remove invalid addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup
# broadcast IP: no packet for this address will be received on ns1
- pm_nl_add_endpoint $ns1 224.0.0.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
+ pm_nl_add_endpoint $ns1 224.0.0.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
pm_nl_set_limits $ns2 2 2
addr_nr_ns1=-3 speed=10 \
run_tests $ns1 $ns2 10.0.1.1
@@ -2631,10 +2631,10 @@ remove_tests()
# subflows and signal, flush
if reset "flush subflows and signal"; then
pm_nl_set_limits $ns1 0 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup
pm_nl_set_limits $ns2 1 3
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2647,9 +2647,9 @@ remove_tests()
if reset "flush subflows"; then
pm_nl_set_limits $ns1 3 3
pm_nl_set_limits $ns2 3 3
- pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow id 150
- pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
- pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
+ pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow,backup id 150
+ pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow,backup
+ pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow,backup
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 3 3 3
@@ -2666,9 +2666,9 @@ remove_tests()
# addresses flush
if reset "flush addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.2.1 flags signal id 250
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.4.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.2.1 flags signal,backup id 250
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.4.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-8 addr_nr_ns2=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -2681,9 +2681,9 @@ remove_tests()
# invalid addresses flush
if reset "flush invalid addresses"; then
pm_nl_set_limits $ns1 3 3
- pm_nl_add_endpoint $ns1 10.0.12.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.3.1 flags signal
- pm_nl_add_endpoint $ns1 10.0.14.1 flags signal
+ pm_nl_add_endpoint $ns1 10.0.12.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.3.1 flags signal,backup
+ pm_nl_add_endpoint $ns1 10.0.14.1 flags signal,backup
pm_nl_set_limits $ns2 3 3
addr_nr_ns1=-8 speed=slow \
run_tests $ns1 $ns2 10.0.1.1
@@ -3500,7 +3500,6 @@ fullmesh_tests()
fastclose_tests()
{
if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then
- MPTCP_LIB_SUBTEST_FLAKY=1
test_linkfail=1024 fastclose=client \
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 0 0 0
@@ -3509,7 +3508,6 @@ fastclose_tests()
fi
if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then
- MPTCP_LIB_SUBTEST_FLAKY=1
test_linkfail=1024 fastclose=server \
run_tests $ns1 $ns2 10.0.1.1
join_rst_nr=1 \
@@ -3806,7 +3804,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns2 2 2
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns1
@@ -3831,7 +3829,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm create destroy subflow
@@ -3839,7 +3837,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3859,7 +3857,7 @@ userspace_tests()
chk_mptcp_info subflows 0 subflows 0
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm create id 0 subflow
@@ -3867,7 +3865,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3880,7 +3878,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 2 2
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm remove initial subflow
@@ -3888,7 +3886,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns2
pm_nl_set_limits $ns1 0 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns2
@@ -3904,7 +3902,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
# userspace pm send RM_ADDR for ID 0
@@ -3912,7 +3910,7 @@ userspace_tests()
continue_if mptcp_lib_has_file '/proc/sys/net/mptcp/pm_type'; then
set_userspace_pm $ns1
pm_nl_set_limits $ns2 1 1
- { speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
wait_mpj $ns1
@@ -3930,7 +3928,7 @@ userspace_tests()
chk_mptcp_info subflows 1 subflows 1
chk_subflows_total 1 1
kill_events_pids
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
}
@@ -3943,7 +3941,7 @@ endpoint_tests()
pm_nl_set_limits $ns1 2 2
pm_nl_set_limits $ns2 2 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
- { speed=slow \
+ { timeout_test=120 test_linkfail=128 speed=slow \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -3960,7 +3958,7 @@ endpoint_tests()
pm_nl_add_endpoint $ns2 10.0.2.2 flags signal
pm_nl_check_endpoint "modif is allowed" \
$ns2 10.0.2.2 id 1 flags signal
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
fi
if reset_with_tcp_filter "delete and re-add" ns2 10.0.3.2 REJECT OUTPUT &&
@@ -3970,7 +3968,7 @@ endpoint_tests()
pm_nl_set_limits $ns2 0 3
pm_nl_add_endpoint $ns2 10.0.1.2 id 1 dev ns2eth1 flags subflow
pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow
- { test_linkfail=4 speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4015,7 +4013,7 @@ endpoint_tests()
chk_mptcp_info subflows 3 subflows 3
done
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
kill_events_pids
chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
@@ -4048,7 +4046,7 @@ endpoint_tests()
# broadcast IP: no packet for this address will be received on ns1
pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
pm_nl_add_endpoint $ns1 10.0.1.1 id 42 flags signal
- { test_linkfail=4 speed=5 \
+ { timeout_test=120 test_linkfail=128 speed=5 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4057,39 +4055,46 @@ endpoint_tests()
$ns1 10.0.2.1 id 1 flags signal
chk_subflow_nr "before delete" 2
chk_mptcp_info subflows 1 subflows 1
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 1
pm_nl_del_endpoint $ns1 1 10.0.2.1
pm_nl_del_endpoint $ns1 2 224.0.0.1
sleep 0.5
chk_subflow_nr "after delete" 1
chk_mptcp_info subflows 0 subflows 0
+ chk_mptcp_info add_addr_signal 0 add_addr_accepted 0
pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal
pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-add" 3
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_del_endpoint $ns1 42 10.0.1.1
sleep 0.5
chk_subflow_nr "after delete ID 0" 2
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_add_endpoint $ns1 10.0.1.1 id 99 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-add ID 0" 3
chk_mptcp_info subflows 3 subflows 3
+ chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
pm_nl_del_endpoint $ns1 99 10.0.1.1
sleep 0.5
chk_subflow_nr "after re-delete ID 0" 2
chk_mptcp_info subflows 2 subflows 2
+ chk_mptcp_info add_addr_signal 2 add_addr_accepted 2
pm_nl_add_endpoint $ns1 10.0.1.1 id 88 flags signal
wait_mpj $ns2
chk_subflow_nr "after re-re-add ID 0" 3
chk_mptcp_info subflows 3 subflows 3
- mptcp_lib_kill_wait $tests_pid
+ chk_mptcp_info add_addr_signal 3 add_addr_accepted 2
+ mptcp_lib_kill_group_wait $tests_pid
kill_events_pids
chk_evt_nr ns1 MPTCP_LIB_EVENT_LISTENER_CREATED 1
@@ -4121,7 +4126,7 @@ endpoint_tests()
# broadcast IP: no packet for this address will be received on ns1
pm_nl_add_endpoint $ns1 224.0.0.1 id 2 flags signal
pm_nl_add_endpoint $ns2 10.0.3.2 id 3 flags subflow
- { test_linkfail=4 speed=20 \
+ { timeout_test=120 test_linkfail=128 speed=20 \
run_tests $ns1 $ns2 10.0.1.1 & } 2>/dev/null
local tests_pid=$!
@@ -4137,7 +4142,7 @@ endpoint_tests()
wait_mpj $ns2
pm_nl_add_endpoint $ns1 10.0.3.1 id 2 flags signal
wait_mpj $ns2
- mptcp_lib_kill_wait $tests_pid
+ mptcp_lib_kill_group_wait $tests_pid
join_syn_tx=3 join_connect_err=1 \
chk_join_nr 2 2 2
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
index d62e653d48b0..f4388900016a 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -350,6 +350,27 @@ mptcp_lib_kill_wait() {
wait "${1}" 2>/dev/null
}
+# $1: PID
+mptcp_lib_pid_list_children() {
+ local curr="${1}"
+ # invoke 'ps' only once: recursive calls get the cached listing as $2
+ local pids="${2:-"$(ps o pid,ppid)"}"
+ # print the given PID itself first
+ echo "${curr}"
+ # then recurse into every process whose PPID (2nd column) is that PID
+ local pid
+ for pid in $(echo "${pids}" | awk "\$2 == ${curr} { print \$1 }"); do
+ mptcp_lib_pid_list_children "${pid}" "${pids}"
+ done
+}
+
+# $1: PID
+mptcp_lib_kill_group_wait() {
+ # Some users might not have procps-ng: cannot use "kill -- -PID", so kill $1 and each listed descendant, then wait for $1
+ mptcp_lib_pid_list_children "${1}" | xargs -r kill &>/dev/null
+ wait "${1}" 2>/dev/null
+}
+
# $1: IP address
mptcp_lib_is_v6() {
[ -z "${1##*:*}" ]
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index f87993def738..d60f10a873bb 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -148,6 +148,14 @@
#define PIDFD_INFO_COREDUMP (1UL << 4)
#endif
+#ifndef PIDFD_INFO_SUPPORTED_MASK
+#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5)
+#endif
+
+#ifndef PIDFD_INFO_COREDUMP_SIGNAL
+#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6)
+#endif
+
#ifndef PIDFD_COREDUMPED
#define PIDFD_COREDUMPED (1U << 0) /* Did crash and... */
#endif
@@ -183,8 +191,11 @@ struct pidfd_info {
__u32 fsuid;
__u32 fsgid;
__s32 exit_code;
- __u32 coredump_mask;
- __u32 __spare1;
+ struct {
+ __u32 coredump_mask;
+ __u32 coredump_signal;
+ };
+ __u64 supported_mask;
};
/*
diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c
index a0eb6e81eaa2..cb5430a2fd75 100644
--- a/tools/testing/selftests/pidfd/pidfd_info_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_info_test.c
@@ -690,4 +690,77 @@ TEST_F(pidfd_info, thread_group_exec_thread)
EXPECT_EQ(close(pidfd_thread), 0);
}
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK field
+ *
+ * Verify that when PIDFD_INFO_SUPPORTED_MASK is requested, the kernel
+ * returns the supported_mask field indicating which flags the kernel supports.
+ */
+TEST(supported_mask_field)
+{
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_SUPPORTED_MASK,
+ };
+ int pidfd;
+ pid_t pid;
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ pause(); /* pid == 0 branch (presumably the child): sleep until the SIGKILL sent during clean up */
+
+ /* Request supported_mask field */
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+ /* Verify PIDFD_INFO_SUPPORTED_MASK is set in the reply */
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+
+ /* Verify supported_mask contains expected flags */
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_PID));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CREDS));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_CGROUPID));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_EXIT));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_SUPPORTED_MASK));
+ ASSERT_TRUE(!!(info.supported_mask & PIDFD_INFO_COREDUMP_SIGNAL));
+
+ /* Clean up: SIGKILL the process behind pidfd, wait for its exit, close the pidfd */
+ sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+ sys_waitid(P_PIDFD, pidfd, NULL, WEXITED);
+ close(pidfd);
+}
+
+/*
+ * Test: PIDFD_INFO_SUPPORTED_MASK always available
+ *
+ * Verify that supported_mask is returned even when other fields are requested.
+ */
+TEST(supported_mask_with_other_fields)
+{
+ struct pidfd_info info = {
+ .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_SUPPORTED_MASK,
+ };
+ int pidfd;
+ pid_t pid;
+
+ pid = create_child(&pidfd, 0);
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ pause(); /* pid == 0 branch (presumably the child): sleep until the SIGKILL sent during clean up */
+
+ ASSERT_EQ(ioctl(pidfd, PIDFD_GET_INFO, &info), 0);
+
+ /* Both fields should be present */
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CGROUPID));
+ ASSERT_TRUE(!!(info.mask & PIDFD_INFO_SUPPORTED_MASK));
+ ASSERT_NE(info.supported_mask, 0);
+
+ /* Clean up: SIGKILL the process behind pidfd, wait for its exit, close the pidfd */
+ sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+ sys_waitid(P_PIDFD, pidfd, NULL, WEXITED);
+ close(pidfd);
+}
+
TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
index 998e5a2f4579..0091bcd91c2c 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -961,5 +961,49 @@
"teardown": [
"$TC qdisc del dev $DUMMY root"
]
+ },
+ {
+ "id": "4989",
+ "name": "Try to add an fq child to an ingress qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY handle ffff:0 ingress"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:",
+ "matchJSON": [],
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress"
+ ]
+ },
+ {
+ "id": "c2b0",
+ "name": "Try to add an fq child to a clsact qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DUMMY handle ffff:0 clsact"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY parent ffff:0 handle ffe0:0 fq",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -j qdisc ls dev $DUMMY handle ffe0:",
+ "matchJSON": [],
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY clsact"
+ ]
}
]
diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c
index 5288e768b207..68625362add2 100644
--- a/tools/testing/selftests/user_events/perf_test.c
+++ b/tools/testing/selftests/user_events/perf_test.c
@@ -236,7 +236,7 @@ TEST_F(user, perf_empty_events) {
ASSERT_EQ(1 << reg.enable_bit, self->check);
/* Ensure write shows up at correct offset */
- ASSERT_NE(-1, write(self->data_fd, &reg.write_index,
+ ASSERT_NE(-1, write(self->data_fd, (void *)&reg.write_index,
sizeof(reg.write_index)));
val = (void *)(((char *)perf_page) + perf_page->data_offset);
ASSERT_EQ(PERF_RECORD_SAMPLE, *val);
diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h
index 240409bf5f8a..69ec0c856481 100644
--- a/tools/testing/selftests/vfio/lib/include/vfio_util.h
+++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h
@@ -4,9 +4,12 @@
#include <fcntl.h>
#include <string.h>
-#include <linux/vfio.h>
+
+#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/list.h>
#include <linux/pci_regs.h>
+#include <linux/vfio.h>
#include "../../../kselftest.h"
@@ -185,6 +188,13 @@ struct vfio_pci_device {
struct vfio_pci_driver driver;
};
+struct iova_allocator { /* allocation cursor over a device's IOVA ranges, advanced by iova_allocator_alloc() */
+ struct iommu_iova_range *ranges; /* array of ranges; freed by iova_allocator_cleanup() */
+ u32 nranges; /* number of entries in ranges */
+ u32 range_idx; /* index of the range the next allocation is tried in */
+ u64 range_offset; /* offset from ranges[range_idx].start where the next allocation is tried */
+};
+
/*
* Return the BDF string of the device that the test should use.
*
@@ -206,6 +216,13 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_
void vfio_pci_device_cleanup(struct vfio_pci_device *device);
void vfio_pci_device_reset(struct vfio_pci_device *device);
+struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges);
+
+struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device);
+void iova_allocator_cleanup(struct iova_allocator *allocator);
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size);
+
int __vfio_pci_dma_map(struct vfio_pci_device *device,
struct vfio_dma_region *region);
int __vfio_pci_dma_unmap(struct vfio_pci_device *device,
diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
index a381fd253aa7..b479a359da12 100644
--- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c
+++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c
@@ -12,11 +12,12 @@
#include <sys/mman.h>
#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/limits.h>
#include <linux/mman.h>
+#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/vfio.h>
-#include <linux/iommufd.h>
#include "../../../kselftest.h"
#include <vfio_util.h>
@@ -29,6 +30,249 @@
VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \
} while (0)
+static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz,
+ u32 *cap_offset)
+{
+ struct vfio_info_cap_header *hdr;
+ /*
+ * Return the capability header located *cap_offset bytes into buf and
+ * advance *cap_offset to that header's ->next. A zero *cap_offset ends
+ * the walk and yields NULL.
+ */
+ if (!*cap_offset)
+ return NULL;
+ /* The whole header must lie inside the bufsz-byte buffer */
+ VFIO_ASSERT_LT(*cap_offset, bufsz);
+ VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr));
+
+ hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset);
+ *cap_offset = hdr->next;
+
+ return hdr;
+}
+
+static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info,
+ u16 cap_id)
+{
+ struct vfio_info_cap_header *hdr;
+ u32 cap_offset = info->cap_offset;
+ u32 max_depth;
+ u32 depth = 0;
+ /*
+ * Walk the capability chain stored in info and return the first header
+ * whose id equals cap_id. Returns NULL when VFIO_IOMMU_INFO_CAPS is not
+ * set in info->flags or when no header matches.
+ */
+ if (!(info->flags & VFIO_IOMMU_INFO_CAPS))
+ return NULL;
+ /* A non-zero first offset must not point into the fixed-size info struct */
+ if (cap_offset)
+ VFIO_ASSERT_GE(cap_offset, sizeof(*info));
+ /* No more headers than fit after the info struct can be visited without revisiting one */
+ max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr);
+
+ while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) {
+ depth++;
+ VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n");
+
+ if (hdr->id == cap_id)
+ return hdr;
+ }
+
+ return NULL;
+}
+
+/* Return buffer including capability chain, if present. Free with free() */
+static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device)
+{
+ struct vfio_iommu_type1_info *info;
+
+ info = malloc(sizeof(*info));
+ VFIO_ASSERT_NOT_NULL(info);
+
+ *info = (struct vfio_iommu_type1_info) {
+ .argsz = sizeof(*info),
+ };
+ /* First query passes only the fixed-size struct; the argsz read back is used below as the full buffer size */
+ ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+ /* Resize the buffer to that argsz and repeat the query */
+ info = realloc(info, info->argsz);
+ VFIO_ASSERT_NOT_NULL(info);
+
+ ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info);
+ VFIO_ASSERT_GE(info->argsz, sizeof(*info));
+
+ return info;
+}
+
+/*
+ * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to
+ * report iommufd's iommu_iova_range. Free with free().
+ *
+ * The number of returned ranges is stored in *nranges. Asserts if the IOVA
+ * range capability is absent or reports zero ranges.
+ */
+static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap_range;
+ struct vfio_iommu_type1_info *info;
+ struct vfio_info_cap_header *hdr;
+ struct iommu_iova_range *ranges = NULL;
+
+ info = vfio_iommu_get_info(device);
+ hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE);
+ VFIO_ASSERT_NOT_NULL(hdr);
+ /* hdr is the 'header' member of the IOVA range capability: recover the enclosing struct */
+ cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header);
+ VFIO_ASSERT_GT(cap_range->nr_iovas, 0);
+
+ ranges = calloc(cap_range->nr_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+ /* Copy each type1 {start, end} pair into iommu_iova_range's {start, last} */
+ for (u32 i = 0; i < cap_range->nr_iovas; i++) {
+ ranges[i] = (struct iommu_iova_range){
+ .start = cap_range->iova_ranges[i].start,
+ .last = cap_range->iova_ranges[i].end,
+ };
+ }
+
+ *nranges = cap_range->nr_iovas;
+ /* cap_range points into info, so info is released only after the copy */
+ free(info);
+ return ranges;
+}
+
+/* Return iova ranges of the device's IOAS. Free with free() */
+static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+ int ret;
+
+ struct iommu_ioas_iova_ranges query = {
+ .size = sizeof(query),
+ .ioas_id = device->ioas_id,
+ };
+ /* First call supplies no array: it is expected to fail with EMSGSIZE yet leave a non-zero count in num_iovas */
+ ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ VFIO_ASSERT_EQ(ret, -1);
+ VFIO_ASSERT_EQ(errno, EMSGSIZE);
+ VFIO_ASSERT_GT(query.num_iovas, 0);
+
+ ranges = calloc(query.num_iovas, sizeof(*ranges));
+ VFIO_ASSERT_NOT_NULL(ranges);
+ /* Second call passes an array sized from that count; this one must succeed */
+ query.allowed_iovas = (uintptr_t)ranges;
+
+ ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query);
+ *nranges = query.num_iovas;
+
+ return ranges;
+}
+
+static int iova_range_comp(const void *a, const void *b)
+{
+ const struct iommu_iova_range *ra = a, *rb = b;
+ /* qsort() comparator: orders ranges by ascending start; equal starts compare as equal */
+ if (ra->start < rb->start)
+ return -1;
+
+ if (ra->start > rb->start)
+ return 1;
+
+ return 0;
+}
+
+/* Return sorted IOVA ranges of the device. Free with free(). */
+struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device,
+ u32 *nranges)
+{
+ struct iommu_iova_range *ranges;
+ /* A non-zero device->iommufd selects the iommufd query, otherwise the vfio container is queried */
+ if (device->iommufd)
+ ranges = iommufd_iova_ranges(device, nranges);
+ else
+ ranges = vfio_iommu_iova_ranges(device, nranges);
+
+ if (!ranges)
+ return NULL;
+
+ VFIO_ASSERT_GT(*nranges, 0);
+
+ /* Sort and check that ranges are sane and non-overlapping */
+ qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp);
+ VFIO_ASSERT_LT(ranges[0].start, ranges[0].last);
+ /* Once sorted, each range must begin strictly after the previous range's last address */
+ for (u32 i = 1; i < *nranges; i++) {
+ VFIO_ASSERT_LT(ranges[i].start, ranges[i].last);
+ VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start);
+ }
+
+ return ranges;
+}
+
+struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device)
+{
+ struct iova_allocator *allocator;
+ struct iommu_iova_range *ranges;
+ u32 nranges;
+ /*
+ * Create an allocator over the device's sorted IOVA ranges. The ranges
+ * array is stored in the allocator; iova_allocator_cleanup() frees both.
+ */
+ ranges = vfio_pci_iova_ranges(device, &nranges);
+ VFIO_ASSERT_NOT_NULL(ranges);
+
+ allocator = malloc(sizeof(*allocator));
+ VFIO_ASSERT_NOT_NULL(allocator);
+ /* Allocation starts at the beginning of the first range */
+ *allocator = (struct iova_allocator){
+ .ranges = ranges,
+ .nranges = nranges,
+ .range_idx = 0,
+ .range_offset = 0,
+ };
+
+ return allocator;
+}
+
+void iova_allocator_cleanup(struct iova_allocator *allocator)
+{
+ free(allocator->ranges); /* ranges array stored by iova_allocator_init() */
+ free(allocator); /* the allocator itself; it must not be used afterwards */
+}
+
+iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size)
+{
+ VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n");
+ VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n");
+ /*
+ * Return the start of a size-byte block, aligned to size, taken from the
+ * allocator's ranges. size must be a non-zero power of two. The cursor
+ * only moves forward: a range that cannot hold the block is left behind,
+ * and running past the last range trips the assertion below.
+ */
+ for (;;) {
+ struct iommu_iova_range *range;
+ iova_t iova, last;
+
+ VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges,
+ "IOVA allocator out of space\n");
+
+ range = &allocator->ranges[allocator->range_idx];
+ iova = range->start + allocator->range_offset;
+
+ /* Check for sufficient space at the current offset */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+
+ /* Align iova to size: last is iova + size - 1, so masking it rounds iova up */
+ iova = last & ~(size - 1);
+
+ /* Check for sufficient space at the aligned iova */
+ if (check_add_overflow(iova, size - 1, &last) ||
+ last > range->last)
+ goto next_range;
+ /* Next allocation resumes right after this block, or in the next range if the block ends at range->last */
+ if (last == range->last) {
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ } else {
+ allocator->range_offset = last - range->start + 1;
+ }
+
+ return iova;
+ /* The current range cannot hold the block: retry from the start of the next range */
+next_range:
+ allocator->range_idx++;
+ allocator->range_offset = 0;
+ }
+}
+
iova_t __to_iova(struct vfio_pci_device *device, void *vaddr)
{
struct vfio_dma_region *region;
diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
index 4f1ea79a200c..102603d4407d 100644
--- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
+++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c
@@ -3,6 +3,8 @@
#include <sys/mman.h>
#include <unistd.h>
+#include <uapi/linux/types.h>
+#include <linux/iommufd.h>
#include <linux/limits.h>
#include <linux/mman.h>
#include <linux/sizes.h>
@@ -93,6 +95,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova,
FIXTURE(vfio_dma_mapping_test) {
struct vfio_pci_device *device;
+ struct iova_allocator *iova_allocator;
};
FIXTURE_VARIANT(vfio_dma_mapping_test) {
@@ -117,10 +120,12 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB |
FIXTURE_SETUP(vfio_dma_mapping_test)
{
self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ self->iova_allocator = iova_allocator_init(self->device);
}
FIXTURE_TEARDOWN(vfio_dma_mapping_test)
{
+ iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
}
@@ -142,7 +147,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap)
else
ASSERT_NE(region.vaddr, MAP_FAILED);
- region.iova = (u64)region.vaddr;
+ region.iova = iova_allocator_alloc(self->iova_allocator, size);
region.size = size;
vfio_pci_dma_map(self->device, &region);
@@ -219,7 +224,10 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES();
FIXTURE_SETUP(vfio_dma_map_limit_test)
{
struct vfio_dma_region *region = &self->region;
+ struct iommu_iova_range *ranges;
u64 region_size = getpagesize();
+ iova_t last_iova;
+ u32 nranges;
/*
* Over-allocate mmap by double the size to provide enough backing vaddr
@@ -232,8 +240,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test)
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
ASSERT_NE(region->vaddr, MAP_FAILED);
- /* One page prior to the end of address space */
- region->iova = ~(iova_t)0 & ~(region_size - 1);
+ ranges = vfio_pci_iova_ranges(self->device, &nranges);
+ VFIO_ASSERT_NOT_NULL(ranges);
+ last_iova = ranges[nranges - 1].last;
+ free(ranges);
+
+ /* One page prior to the last iova */
+ region->iova = last_iova & ~(region_size - 1);
region->size = region_size;
}
@@ -276,6 +289,7 @@ TEST_F(vfio_dma_map_limit_test, overflow)
struct vfio_dma_region *region = &self->region;
int rc;
+ region->iova = ~(iova_t)0 & ~(region->size - 1);
region->size = self->mmap_size;
rc = __vfio_pci_dma_map(self->device, region);
diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
index 2dbd70b7db62..f69eec8b928d 100644
--- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c
+++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c
@@ -19,6 +19,7 @@ static const char *device_bdf;
} while (0)
static void region_setup(struct vfio_pci_device *device,
+ struct iova_allocator *iova_allocator,
struct vfio_dma_region *region, u64 size)
{
const int flags = MAP_SHARED | MAP_ANONYMOUS;
@@ -29,7 +30,7 @@ static void region_setup(struct vfio_pci_device *device,
VFIO_ASSERT_NE(vaddr, MAP_FAILED);
region->vaddr = vaddr;
- region->iova = (u64)vaddr;
+ region->iova = iova_allocator_alloc(iova_allocator, size);
region->size = size;
vfio_pci_dma_map(device, region);
@@ -44,6 +45,7 @@ static void region_teardown(struct vfio_pci_device *device,
FIXTURE(vfio_pci_driver_test) {
struct vfio_pci_device *device;
+ struct iova_allocator *iova_allocator;
struct vfio_dma_region memcpy_region;
void *vaddr;
int msi_fd;
@@ -72,14 +74,15 @@ FIXTURE_SETUP(vfio_pci_driver_test)
struct vfio_pci_driver *driver;
self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode);
+ self->iova_allocator = iova_allocator_init(self->device);
driver = &self->device->driver;
- region_setup(self->device, &self->memcpy_region, SZ_1G);
- region_setup(self->device, &driver->region, SZ_2M);
+ region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G);
+ region_setup(self->device, self->iova_allocator, &driver->region, SZ_2M);
/* Any IOVA that doesn't overlap memcpy_region and driver->region. */
- self->unmapped_iova = 8UL * SZ_1G;
+ self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G);
vfio_pci_driver_init(self->device);
self->msi_fd = self->device->msi_eventfds[driver->msi];
@@ -108,6 +111,7 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test)
region_teardown(self->device, &self->memcpy_region);
region_teardown(self->device, &driver->region);
+ iova_allocator_cleanup(self->iova_allocator);
vfio_pci_device_cleanup(self->device);
}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fbca8c0972da..ffadc5ee8e04 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -623,31 +623,50 @@ err:
return r;
}
-void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem)
{
unsigned long start = slot->gmem.pgoff;
unsigned long end = start + slot->npages;
- struct kvm_gmem *gmem;
+
+ xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+
+ /*
+ * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
+ * cannot see this memslot.
+ */
+ WRITE_ONCE(slot->gmem.file, NULL);
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
struct file *file;
/*
- * Nothing to do if the underlying file was already closed (or is being
- * closed right now), kvm_gmem_release() invalidates all bindings.
+ * Nothing to do if the underlying file was _already_ closed, as
+ * kvm_gmem_release() invalidates and nullifies all bindings.
*/
- file = kvm_gmem_get_file(slot);
- if (!file)
+ if (!slot->gmem.file)
return;
- gmem = file->private_data;
-
- filemap_invalidate_lock(file->f_mapping);
- xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+ file = kvm_gmem_get_file(slot);
/*
- * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
- * cannot see this memslot.
+ * However, if the file is _being_ closed, then the bindings need to be
+ * removed as kvm_gmem_release() might not run until after the memslot
+ * is freed. Note, modifying the bindings is safe even though the file
+ * is dying as kvm_gmem_release() nullifies slot->gmem.file under
+ * slots_lock, and only puts its reference to KVM after destroying all
+ * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
+ * yet destroyed the bindings or freed the gmem_file, and can't do so
+ * until the caller drops slots_lock.
*/
- WRITE_ONCE(slot->gmem.file, NULL);
+ if (!file) {
+ __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
+ return;
+ }
+
+ filemap_invalidate_lock(file->f_mapping);
+ __kvm_gmem_unbind(slot, file->private_data);
filemap_invalidate_unlock(file->f_mapping);
fput(file);