diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d4..a0ed87386639 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -28,6 +28,18 @@ Description: For more details refer Documentation/admin-guide/iostats.rst +What: /sys/block/<disk>/diskseq +Date: February 2021 +Contact: Matteo Croce +Description: + The /sys/block/<disk>/diskseq file reports the disk + sequence number, which is a monotonically increasing + number assigned to every drive. + Some devices, like the loop device, refresh this number + every time the backing file is changed. + The value type is 64 bit unsigned. + + What: /sys/block/<disk>/<part>/stat Date: February 2008 Contact: Jerome Marchand diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device index aa0fb500e3c9..7ac7b19b2f72 100644 --- a/Documentation/ABI/testing/sysfs-block-device +++ b/Documentation/ABI/testing/sysfs-block-device @@ -55,6 +55,43 @@ Date: Oct, 2016 KernelVersion: v4.10 Contact: linux-ide@vger.kernel.org Description: - (RW) Write to the file to turn on or off the SATA ncq (native - command queueing) support. By default this feature is turned - off. + (RW) Write to the file to turn on or off the SATA NCQ (native + command queueing) priority support. By default this feature is + turned off. If the device does not support the SATA NCQ + priority feature, writing "1" to this file results in an error + (see ncq_prio_supported). + + +What: /sys/block/*/device/sas_ncq_prio_enable +Date: Oct, 2016 +KernelVersion: v4.10 +Contact: linux-ide@vger.kernel.org +Description: + (RW) This is the equivalent of the ncq_prio_enable attribute + file for SATA devices connected to a SAS host-bus-adapter + (HBA) implementing support for the SATA NCQ priority feature.
+ This file does not exist if the HBA driver does not implement + support for the SATA NCQ priority feature, regardless of the + device support for this feature (see sas_ncq_prio_supported). + + +What: /sys/block/*/device/ncq_prio_supported +Date: Aug, 2021 +KernelVersion: v5.15 +Contact: linux-ide@vger.kernel.org +Description: + (RO) Indicates if the device supports the SATA NCQ (native + command queueing) priority feature. + + +What: /sys/block/*/device/sas_ncq_prio_supported +Date: Aug, 2021 +KernelVersion: v5.15 +Contact: linux-ide@vger.kernel.org +Description: + (RO) This is the equivalent of the ncq_prio_supported attribute + file for SATA devices connected to a SAS host-bus-adapter + (HBA) implementing support for the SATA NCQ priority feature. + This file does not exist if the HBA driver does not implement + support for the SATA NCQ priority feature, regardless of the + device support for this feature. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index f12cda55538b..8cbc711cda93 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -16,3 +16,4 @@ are configurable at compile, boot or run time. multihit.rst special-register-buffer-data-sampling.rst core-scheduling.rst + l1d_flush.rst diff --git a/Documentation/admin-guide/hw-vuln/l1d_flush.rst b/Documentation/admin-guide/hw-vuln/l1d_flush.rst new file mode 100644 index 000000000000..210020bc3f56 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst @@ -0,0 +1,69 @@ +L1D Flushing +============ + +With an increasing number of vulnerabilities being reported around data +leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in +mechanism to flush the L1D cache on context switch. + +This mechanism can be used to address e.g. CVE-2020-0550. For applications +the mechanism keeps them safe from vulnerabilities, related to leaks +(snooping of) from the L1D cache. 
+ + +Related CVEs +------------ +The following CVEs can be addressed by this +mechanism: + + ============= ======================== ================== + CVE-2020-0550 Improper Data Forwarding OS related aspects + ============= ======================== ================== + +Usage Guidelines +---------------- + +Please see document: :ref:`Documentation/userspace-api/spec_ctrl.rst +<set_spec_ctrl>` for details. + +**NOTE**: The feature is disabled by default; applications need to +specifically opt into the feature to enable it. + +Mitigation +---------- + +When PR_SPEC_L1D_FLUSH is enabled for a task, a flush of the L1D cache is +performed when the task is scheduled out and the incoming task belongs to a +different process and therefore to a different address space. + +If the underlying CPU supports L1D flushing in hardware, the hardware +mechanism is used. A software fallback for the mitigation is not supported. + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows control of the L1D flush mitigations at boot +time with the option "l1d_flush=". The valid arguments for this option are: + + ============ ============================================================= + on Enables the prctl interface; applications trying to use + the prctl() will fail with an error if l1d_flush is not + enabled + ============ ============================================================= + +By default the mechanism is disabled. + +Limitations +----------- + +The mechanism does not mitigate L1D data leaks between tasks belonging to +different processes which are concurrently executing on sibling threads of +a physical CPU core when SMT is enabled on the system. + +This can be addressed by controlled placement of processes on physical CPU +cores or by disabling SMT. See the relevant chapter in the L1TF mitigation +document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.
+ +**NOTE**: The opt-in of a task for L1D flushing works only when the task's +affinity is limited to cores running in non-SMT mode. If a task which +requested L1D flushing is scheduled on an SMT-enabled core, the kernel sends +a SIGBUS to the task. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6f10a677632e..4b0dc54e9f11 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2425,6 +2425,23 @@ feature (tagged TLBs) on capable Intel chips. Default is 1 (enabled) + l1d_flush= [X86,INTEL] + Control mitigation for L1D based snooping vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack, to access data to which the attacker does + not have direct access. + + This parameter controls the mitigation.
The + options are: + + on - enable the interface for the mitigation + l1tf= [X86] Control mitigation of the L1TF vulnerability on affected CPUs @@ -4781,7 +4798,7 @@ reboot= [KNL] Format (x86 or x86_64): - [w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \ + [w[arm] | c[old] | h[ard] | s[oft] | g[pio]] | d[efault] \ [[,]s[mp]#### \ [[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \ [[,]f[orce] diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt deleted file mode 100644 index d65fdce7c7f0..000000000000 --- a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt +++ /dev/null @@ -1,27 +0,0 @@ -Rockchip rk timer - -Required properties: -- compatible: should be: - "rockchip,rv1108-timer", "rockchip,rk3288-timer": for Rockchip RV1108 - "rockchip,rk3036-timer", "rockchip,rk3288-timer": for Rockchip RK3036 - "rockchip,rk3066-timer", "rockchip,rk3288-timer": for Rockchip RK3066 - "rockchip,rk3188-timer", "rockchip,rk3288-timer": for Rockchip RK3188 - "rockchip,rk3228-timer", "rockchip,rk3288-timer": for Rockchip RK3228 - "rockchip,rk3229-timer", "rockchip,rk3288-timer": for Rockchip RK3229 - "rockchip,rk3288-timer": for Rockchip RK3288 - "rockchip,rk3368-timer", "rockchip,rk3288-timer": for Rockchip RK3368 - "rockchip,rk3399-timer": for Rockchip RK3399 -- reg: base address of the timer register starting with TIMERS CONTROL register -- interrupts: should contain the interrupts for Timer0 -- clocks : must contain an entry for each entry in clock-names -- clock-names : must include the following entries: - "timer", "pclk" - -Example: - timer: timer@ff810000 { - compatible = "rockchip,rk3288-timer"; - reg = <0xff810000 0x20>; - interrupts = ; - clocks = <&xin24m>, <&cru PCLK_TIMER>; - clock-names = "timer", "pclk"; - }; diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml new file mode 100644 
index 000000000000..e26ecb5893ae --- /dev/null +++ b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/rockchip,rk-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip Timer Device Tree Bindings + +maintainers: + - Daniel Lezcano + +properties: + compatible: + oneOf: + - const: rockchip,rk3288-timer + - const: rockchip,rk3399-timer + - items: + - enum: + - rockchip,rv1108-timer + - rockchip,rk3036-timer + - rockchip,rk3066-timer + - rockchip,rk3188-timer + - rockchip,rk3228-timer + - rockchip,rk3229-timer + - rockchip,rk3288-timer + - rockchip,rk3368-timer + - rockchip,px30-timer + - const: rockchip,rk3288-timer + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 2 + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: timer + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + + timer: timer@ff810000 { + compatible = "rockchip,rk3288-timer"; + reg = <0xff810000 0x20>; + interrupts = ; + clocks = <&cru PCLK_TIMER>, <&xin24m>; + clock-names = "pclk", "timer"; + }; diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f5a3207aa7fa..c57c609ad2eb 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -85,7 +85,6 @@ available subsections can be seen below. 
io-mapping io_ordering generic-counter - lightnvm-pblk memory-devices/index men-chameleon-bus ntb diff --git a/Documentation/driver-api/lightnvm-pblk.rst b/Documentation/driver-api/lightnvm-pblk.rst deleted file mode 100644 index 1040ed1cec81..000000000000 --- a/Documentation/driver-api/lightnvm-pblk.rst +++ /dev/null @@ -1,21 +0,0 @@ -pblk: Physical Block Device Target -================================== - -pblk implements a fully associative, host-based FTL that exposes a traditional -block I/O interface. Its primary responsibilities are: - - - Map logical addresses onto physical addresses (4KB granularity) in a - logical-to-physical (L2P) table. - - Maintain the integrity and consistency of the L2P table as well as its - recovery from normal tear down and power outage. - - Deal with controller- and media-specific constrains. - - Handle I/O errors. - - Implement garbage collection. - - Maintain consistency across the I/O stack during synchronization points. - -For more information please refer to: - - http://lightnvm.io - -which maintains updated FAQs, manual pages, technical documentation, tools, -contacts, etc. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 1409e40e6345..b7070d76f076 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -160,7 +160,6 @@ Code Seq# Include File Comments 'K' all linux/kd.h 'L' 00-1F linux/loop.h conflict! 'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict! -'L' 20-2F linux/lightnvm.h 'L' E0-FF linux/ppdd.h encrypted disk device driver 'M' all linux/soundcard.h conflict! 
diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 7ddd8f667459..5e8ed9eef9aa 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -106,3 +106,11 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_L1D_FLUSH: Flush L1D Cache on context switch out of the task + (works only when tasks run on non SMT cores) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_DISABLE, 0, 0); diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 5f62b3b86357..ccb7e86bf8d9 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -126,7 +126,7 @@ Idle loop Rebooting ========= - reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old] + reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old] bios Use the CPU reboot vector for warm reset warm @@ -145,6 +145,8 @@ Rebooting Use efi reset_system runtime service. If EFI is not configured or the EFI reset does not work, the reboot path attempts the reset using the keyboard controller. + pci + Use a write to the PCI config space register 0xcf9 to trigger reboot. Using warm reset will be much faster especially on big memory systems because the BIOS will not go through the memory check. @@ -155,6 +157,13 @@ Rebooting Don't stop other CPUs on reboot. This can make reboot more reliable in some cases. 
+ reboot=default + There are some built-in platform specific "quirks" - you may see: + "reboot: <name> series board detected. Selecting <type> for reboots." + In the case where you think the quirk is in error (e.g. you have a + newer BIOS or a newer board), using this option will ignore the built-in + quirk table, and use the generic default reboot actions. + Non Executable Mappings ======================= diff --git a/MAINTAINERS b/MAINTAINERS index 14d4e77ffd1b..62acf826a043 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10626,15 +10626,6 @@ F: LICENSES/ F: scripts/spdxcheck-test.sh F: scripts/spdxcheck.py -LIGHTNVM PLATFORM SUPPORT -M: Matias Bjorling -L: linux-block@vger.kernel.org -S: Maintained -W: http://github/OpenChannelSSD -F: drivers/lightnvm/ -F: include/linux/lightnvm.h -F: include/uapi/linux/lightnvm.h - LINEAR RANGES HELPERS M: Mark Brown R: Matti Vaittinen diff --git a/arch/Kconfig b/arch/Kconfig index 129df498a8e1..98db63496bab 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1282,6 +1282,9 @@ config ARCH_SPLIT_ARG64 config ARCH_HAS_ELFCORE_COMPAT bool +config ARCH_HAS_PARANOID_L1D_FLUSH + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig index d92306472fce..8898ae321779 100644 --- a/arch/m68k/configs/stmark2_defconfig +++ b/arch/m68k/configs/stmark2_defconfig @@ -22,7 +22,6 @@ CONFIG_RAMSIZE=0x8000000 CONFIG_VECTORBASE=0x40000000 CONFIG_KERNELBASE=0x40001000 # CONFIG_BLK_DEV_BSG is not set -CONFIG_BLK_CMDLINE_PARSER=y CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y CONFIG_BINFMT_MISC=y diff --git a/arch/mips/include/asm/mach-rc32434/rb.h b/arch/mips/include/asm/mach-rc32434/rb.h index d502673a4f6c..34d179ca020b 100644 --- a/arch/mips/include/asm/mach-rc32434/rb.h +++ b/arch/mips/include/asm/mach-rc32434/rb.h @@ -7,8 +7,6 @@ #ifndef __ASM_RC32434_RB_H #define __ASM_RC32434_RB_H -#include - #define REGBASE 0x18000000 #define IDT434_REG_BASE ((volatile void *)
KSEG1ADDR(REGBASE)) #define UART0BASE 0x58000 diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 1f2be234b11c..bc68231a8fb7 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -132,7 +132,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 8dd02b842fef..434ef5b64599 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -127,7 +127,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index e497185dd393..cd9dc0556e91 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1268,8 +1268,7 @@ static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, rq_for_each_segment(bvec, req, iter) { BUG_ON(i >= io_req->desc_cnt); - io_req->io_desc[i].buffer = - page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].buffer = bvec_virt(&bvec); io_req->io_desc[i].length = bvec.bv_len; i++; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 88fb922c23a0..421fa9e38c60 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -119,6 +119,7 @@ config X86 select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 + select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 89789e8c80f6..637fa1df3512 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -19,6 +19,8 @@ extern 
unsigned int cached_irq_mask; #define PIC_MASTER_OCW3 PIC_MASTER_ISR #define PIC_SLAVE_CMD 0xa0 #define PIC_SLAVE_IMR 0xa1 +#define PIC_ELCR1 0x4d0 +#define PIC_ELCR2 0x4d1 /* i8259A PIC related value */ #define PIC_CASCADE_IR 2 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 3ad8c6d3cbb3..ec2d5c8c6694 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -252,6 +252,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + #include /** diff --git a/arch/x86/include/asm/pc-conf-reg.h b/arch/x86/include/asm/pc-conf-reg.h new file mode 100644 index 000000000000..56bceceacf5f --- /dev/null +++ b/arch/x86/include/asm/pc-conf-reg.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for the configuration register space at port I/O locations + * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec, + * Cyrix CPUs, numerous chipsets. 
+ */ +#ifndef _ASM_X86_PC_CONF_REG_H +#define _ASM_X86_PC_CONF_REG_H + +#include +#include +#include + +#define PC_CONF_INDEX 0x22 +#define PC_CONF_DATA 0x23 + +#define PC_CONF_MPS_IMCR 0x70 + +extern raw_spinlock_t pc_conf_lock; + +static inline u8 pc_conf_get(u8 reg) +{ + outb(reg, PC_CONF_INDEX); + return inb(PC_CONF_DATA); +} + +static inline void pc_conf_set(u8 reg, u8 data) +{ + outb(reg, PC_CONF_INDEX); + outb(data, PC_CONF_DATA); +} + +#endif /* _ASM_X86_PC_CONF_REG_H */ diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h index df700a6cc869..efe3e46e454b 100644 --- a/arch/x86/include/asm/processor-cyrix.h +++ b/arch/x86/include/asm/processor-cyrix.h @@ -5,14 +5,14 @@ * Access order is always 0x22 (=offset), 0x23 (=value) */ +#include + static inline u8 getCx86(u8 reg) { - outb(reg, 0x22); - return inb(0x23); + return pc_conf_get(reg); } static inline void setCx86(u8 reg, u8 data) { - outb(reg, 0x22); - outb(data, 0x23); + pc_conf_set(reg, data); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 33dd1575104e..9ad2acaaae9b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -136,6 +136,8 @@ struct cpuinfo_x86 { u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; + /* Is SMT active on this core? 
*/ + bool smt_active; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index de406d93b515..cf132663c219 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -81,7 +81,7 @@ struct thread_info { #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ +#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -93,6 +93,7 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -104,7 +105,7 @@ struct thread_info { #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) +#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -115,6 +116,7 @@ struct thread_info { #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << 
TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fa952eadbc2e..b587a9ee9cb2 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -83,7 +83,7 @@ struct tlb_state { /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; - unsigned long last_user_mm_ibpb; + unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e55e0c1fad8c..14bcd59bcdee 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -558,10 +558,10 @@ acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end * If a PIC-mode SCI is not recognized or gives spurious IRQ7's * it may require Edge Trigger -- use "acpi_sci=edge" * - * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. 
- * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) - * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0) */ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) @@ -570,7 +570,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) unsigned int old, new; /* Real old ELCR mask */ - old = inb(0x4d0) | (inb(0x4d1) << 8); + old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8); /* * If we use ACPI to set PCI IRQs, then we should clear ELCR @@ -596,8 +596,8 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) return; pr_warn("setting ELCR to %04x (from %04x)\n", new, old); - outb(new, 0x4d0); - outb(new >> 8, 0x4d1); + outb(new, PIC_ELCR1); + outb(new >> 8, PIC_ELCR2); } int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d262811ce14b..b70344bf6600 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -132,18 +133,14 @@ static int enabled_via_apicbase __ro_after_init; */ static inline void imcr_pic_to_apic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go through APIC */ - outb(0x01, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x01); } static inline void imcr_apic_to_pic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go directly to BSP */ - outb(0x00, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x00); } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 39224e035e47..c1bb384935b0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -764,7 +764,7 @@ static bool irq_active_low(int idx) static bool EISA_ELCR(unsigned int irq) { if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); + unsigned int port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq 
& 7)) & 1; } apic_printk(APIC_VERBOSE, KERN_INFO diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fb67ed5e7e6a..c132daabe615 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1299,7 +1299,7 @@ static void __init print_PIC(void) pr_debug("... PIC ISR: %04x\n", v); - v = inb(0x4d1) << 8 | inb(0x4d0); + v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1); pr_debug("... PIC ELCR: %04x\n", v); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d41b70fe4918..ecfca3bbcd96 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -43,6 +43,7 @@ static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -76,6 +77,13 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + void __init check_bugs(void) { identify_boot_cpu(); @@ -111,6 +119,7 @@ void __init check_bugs(void) mds_select_mitigation(); taa_select_mitigation(); srbds_select_mitigation(); + l1d_flush_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -491,6 +500,34 @@ static int __init srbds_parse_cmdline(char *str) } early_param("srbds", srbds_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static void __init l1d_flush_select_mitigation(void) +{ + if 
(!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + return; + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -1215,6 +1252,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -1324,6 +1379,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -1340,6 +1397,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -1390,6 +1458,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case 
PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 282b4ee1339f..15aefa3f3e18 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -235,15 +235,15 @@ static char irq_trigger[2]; */ static void restore_ELCR(char *trigger) { - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); + outb(trigger[0], PIC_ELCR1); + outb(trigger[1], PIC_ELCR2); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; + trigger[0] = inb(PIC_ELCR1) & 0xF8; + trigger[1] = inb(PIC_ELCR2) & 0xDE; } static void i8259A_resume(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8f06449aab27..fed721f90116 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -251,7 +252,7 @@ static int __init ELCR_trigger(unsigned int irq) { unsigned int port; - port = 0x4d0 + (irq >> 3); + port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ebfb91108232..0a40df66a40d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -388,10 +388,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, { /* Handle problems with rebooting on the OptiPlex 990. 
*/ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell OptiPlex 990 BIOS A0x", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), }, }, { /* Handle problems with rebooting on Dell 300's */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9320285a5e29..85f6e242b6b4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -610,6 +610,9 @@ void set_cpu_sibling_map(int cpu) if (threads > __max_smt_threads) __max_smt_threads = threads; + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -1552,8 +1555,13 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 629a09ca9860..0b80263d46d8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -541,17 +541,17 @@ static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, addr, len, val); } -static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_write(container_of(dev, struct kvm_pic, dev_elcr), addr, 
len, val); } -static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_read(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } @@ -577,9 +577,9 @@ static const struct kvm_io_device_ops picdev_slave_ops = { .write = picdev_slave_write, }; -static const struct kvm_io_device_ops picdev_eclr_ops = { - .read = picdev_eclr_read, - .write = picdev_eclr_write, +static const struct kvm_io_device_ops picdev_elcr_ops = { + .read = picdev_elcr_read, + .write = picdev_elcr_write, }; int kvm_pic_init(struct kvm *kvm) @@ -602,7 +602,7 @@ int kvm_pic_init(struct kvm *kvm) */ kvm_iodevice_init(&s->dev_master, &picdev_master_ops); kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); - kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); + kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops); mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, &s->dev_master); @@ -613,7 +613,7 @@ int kvm_pic_init(struct kvm *kvm) if (ret < 0) goto fail_unreg_2; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr); if (ret < 0) goto fail_unreg_1; @@ -647,7 +647,7 @@ void kvm_pic_destroy(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr); mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9b64abf9b3f1..650642b18d15 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -55,7 +55,7 @@ struct kvm_pic { int output; /* 
intr from master PIC */ struct kvm_io_device dev_master; struct kvm_io_device dev_slave; - struct kvm_io_device dev_eclr; + struct kvm_io_device dev_elcr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index bad4dee4f0e4..c6506c6a7092 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += pc-conf-reg.o lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o diff --git a/arch/x86/lib/pc-conf-reg.c b/arch/x86/lib/pc-conf-reg.c new file mode 100644 index 000000000000..febb52749e8d --- /dev/null +++ b/arch/x86/lib/pc-conf-reg.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for the configuration register space at port I/O locations + * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec, + * Cyrix CPUs, numerous chipsets. As the space is indirectly addressed + * it may have to be protected with a spinlock, depending on the context. + */ + +#include + +#include + +DEFINE_RAW_SPINLOCK(pc_conf_lock); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index cfe6b1e85fa6..59ba2968af1b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -43,10 +45,15 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. 
*/ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB /* * The x86 feature is called PCID (Process Context IDentifier). It is similar @@ -317,20 +324,70 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. + */ +static void l1d_flush_force_sigbus(struct callback_head *ch) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; - - return (unsigned long)next->mm | ibpb; + force_sig(SIGBUS); } -static void cond_ibpb(struct task_struct *next) +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) { - if (!next || !next->mm) + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) return; /* + * Validate that it is not running on an SMT sibling as this would + * make the exercise pointless because the siblings share L1D. 
If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); + } +} + +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; + + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + + return (unsigned long)next->mm | spec_bits; +} + +static void cond_mitigation(struct task_struct *next) +{ + unsigned long prev_mm, next_mm; + + if (!next || !next->mm) + return; + + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + + /* + * Avoid user/user BTB poisoning by flushing the branch predictor + * when switching between processes. This stops one process from + * doing Spectre-v2 attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -340,8 +397,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. */ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -371,20 +426,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. 
- */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. + * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -393,11 +442,22 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != + (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. + */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } #ifdef CONFIG_PERF_EVENTS @@ -531,11 +591,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. */ - cond_ibpb(tsk); + cond_mitigation(tsk); /* * Stop remote flushes for the previous mm. @@ -643,7 +702,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. 
*/ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index d3a73f9335e1..97b63e35e152 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -13,9 +13,13 @@ #include #include #include +#include #include #include #include + +#include +#include #include #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) @@ -47,6 +51,8 @@ struct irq_router { int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); + int (*lvl)(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq); }; struct irq_router_handler { @@ -153,7 +159,7 @@ static void __init pirq_peer_trick(void) void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); - unsigned int port = 0x4d0 + (irq >> 3); + unsigned int port = PIC_ELCR1 + (irq >> 3); unsigned char val; static u16 elcr_irq_mask; @@ -169,6 +175,139 @@ void elcr_set_level_irq(unsigned int irq) } } +/* + * PIRQ routing for the M1487 ISA Bus Controller (IBC) ASIC used + * with the ALi FinALi 486 chipset. The IBC is not decoded in the + * PCI configuration space, so we identify it by the accompanying + * M1489 Cache-Memory PCI Controller (CMP) ASIC. + * + * There are four 4-bit mappings provided, spread across two PCI + * INTx Routing Table Mapping Registers, available in the port I/O + * space accessible indirectly via the index/data register pair at + * 0x22/0x23, located at indices 0x42 and 0x43 for the INT1/INT2 + * and INT3/INT4 lines respectively. 
The INT1/INT3 and INT2/INT4 + * lines are mapped in the low and the high 4-bit nibble of the + * corresponding register as follows: + * + * 0000 : Disabled + * 0001 : IRQ9 + * 0010 : IRQ3 + * 0011 : IRQ10 + * 0100 : IRQ4 + * 0101 : IRQ5 + * 0110 : IRQ7 + * 0111 : IRQ6 + * 1000 : Reserved + * 1001 : IRQ11 + * 1010 : Reserved + * 1011 : IRQ12 + * 1100 : Reserved + * 1101 : IRQ14 + * 1110 : Reserved + * 1111 : IRQ15 + * + * In addition to the usual ELCR register pair there is a separate + * PCI INTx Sensitivity Register at index 0x44 in the same port I/O + * space, whose bits 3:0 select the trigger mode for INT[4:1] lines + * respectively. Any bit set to 1 causes interrupts coming on the + * corresponding line to be passed to ISA as edge-triggered and + * otherwise they are passed as level-triggered. Manufacturer's + * documentation says this register has to be set consistently with + * the relevant ELCR register. + * + * Accesses to the port I/O space concerned here need to be unlocked + * by writing the value of 0xc5 to the Lock Register at index 0x03 + * beforehand. Any other value written to said register prevents + * further accesses from reaching the register file, except for the + * Lock Register being written with 0xc5 again. + * + * References: + * + * "M1489/M1487: 486 PCI Chip Set", Version 1.2, Acer Laboratories + * Inc., July 1997 + */ + +#define PC_CONF_FINALI_LOCK 0x03u +#define PC_CONF_FINALI_PCI_INTX_RT1 0x42u +#define PC_CONF_FINALI_PCI_INTX_RT2 0x43u +#define PC_CONF_FINALI_PCI_INTX_SENS 0x44u + +#define PC_CONF_FINALI_LOCK_KEY 0xc5u + +static u8 read_pc_conf_nybble(u8 base, u8 index) +{ + u8 reg = base + (index >> 1); + u8 x; + + x = pc_conf_get(reg); + return index & 1 ? x >> 4 : x & 0xf; +} + +static void write_pc_conf_nybble(u8 base, u8 index, u8 val) +{ + u8 reg = base + (index >> 1); + u8 x; + + x = pc_conf_get(reg); + x = index & 1 ? 
(x & 0x0f) | (val << 4) : (x & 0xf0) | val; + pc_conf_set(reg, x); +} + +static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) +{ + static const u8 irqmap[16] = { + 0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15 + }; + unsigned long flags; + u8 x; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1)]; + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return x; +} + +static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + static const u8 irqmap[16] = { + 0, 0, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 + }; + u8 val = irqmap[irq]; + unsigned long flags; + + if (!val) + return 0; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1, val); + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + +static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + u8 mask = ~(1u << (pirq - 1)); + unsigned long flags; + u8 trig; + + elcr_set_level_irq(irq); + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + trig = pc_conf_get(PC_CONF_FINALI_PCI_INTX_SENS); + trig &= mask; + pc_conf_set(PC_CONF_FINALI_PCI_INTX_SENS, trig); + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + /* * Common IRQ routing practice: nibbles in config space, * offset by some magic constant. @@ -219,6 +358,74 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i return 0; } +/* + * PIRQ routing for the 82374EB/82374SB EISA System Component (ESC) + * ASIC used with the Intel 82420 and 82430 PCIsets. 
The ESC is not + * decoded in the PCI configuration space, so we identify it by the + * accompanying 82375EB/82375SB PCI-EISA Bridge (PCEB) ASIC. + * + * There are four PIRQ Route Control registers, available in the + * port I/O space accessible indirectly via the index/data register + * pair at 0x22/0x23, located at indices 0x60/0x61/0x62/0x63 for the + * PIRQ0/1/2/3# lines respectively. The semantics is the same as + * with the PIIX router. + * + * Accesses to the port I/O space concerned here need to be unlocked + * by writing the value of 0x0f to the ESC ID Register at index 0x02 + * beforehand. Any other value written to said register prevents + * further accesses from reaching the register file, except for the + * ESC ID Register being written with 0x0f again. + * + * References: + * + * "82374EB/82374SB EISA System Component (ESC)", Intel Corporation, + * Order Number: 290476-004, March 1996 + * + * "82375EB/82375SB PCI-EISA Bridge (PCEB)", Intel Corporation, Order + * Number: 290477-004, March 1996 + */ + +#define PC_CONF_I82374_ESC_ID 0x02u +#define PC_CONF_I82374_PIRQ_ROUTE_CONTROL 0x60u + +#define PC_CONF_I82374_ESC_ID_KEY 0x0fu + +static int pirq_esc_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + unsigned long flags; + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY); + x = pc_conf_get(reg); + pc_conf_set(PC_CONF_I82374_ESC_ID, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return (x < 16) ? 
x : 0; +} + +static int pirq_esc_set(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq) +{ + unsigned long flags; + int reg; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY); + pc_conf_set(reg, irq); + pc_conf_set(PC_CONF_I82374_ESC_ID, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + /* * The Intel PIIX4 pirq rules are fairly simple: "pirq" is * just a pointer to the config space. @@ -237,6 +444,50 @@ static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, return 1; } +/* + * PIRQ routing for the 82426EX ISA Bridge (IB) ASIC used with the + * Intel 82420EX PCIset. + * + * There are only two PIRQ Route Control registers, available in the + * combined 82425EX/82426EX PCI configuration space, at 0x66 and 0x67 + * for the PIRQ0# and PIRQ1# lines respectively. The semantics is + * the same as with the PIIX router. + * + * References: + * + * "82420EX PCIset Data Sheet, 82425EX PCI System Controller (PSC) + * and 82426EX ISA Bridge (IB)", Intel Corporation, Order Number: + * 290488-004, December 1995 + */ + +#define PCI_I82426EX_PIRQ_ROUTE_CONTROL 0x66u + +static int pirq_ib_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 2) + reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1; + + pci_read_config_byte(router, reg, &x); + return (x < 16) ? x : 0; +} + +static int pirq_ib_set(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq) +{ + int reg; + + reg = pirq; + if (reg >= 1 && reg <= 2) + reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1; + + pci_write_config_byte(router, reg, irq); + return 1; +} + /* * The VIA pirq rules are nibble-based, like ALI, * but without the ugly irq number munging. 
@@ -549,6 +800,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 0; switch (device) { + case PCI_DEVICE_ID_INTEL_82375: + r->name = "PCEB/ESC"; + r->get = pirq_esc_get; + r->set = pirq_esc_set; + return 1; case PCI_DEVICE_ID_INTEL_82371FB_0: case PCI_DEVICE_ID_INTEL_82371SB_0: case PCI_DEVICE_ID_INTEL_82371AB_0: @@ -594,6 +850,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; + case PCI_DEVICE_ID_INTEL_82425: + r->name = "PSC/IB"; + r->get = pirq_ib_get; + r->set = pirq_ib_set; + return 1; } if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && @@ -745,6 +1006,12 @@ static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { switch (device) { + case PCI_DEVICE_ID_AL_M1489: + r->name = "FinALi"; + r->get = pirq_finali_get; + r->set = pirq_finali_set; + r->lvl = pirq_finali_lvl; + return 1; case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: r->name = "ALI"; @@ -968,11 +1235,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - elcr_set_level_irq(irq); + if (r->lvl) + r->lvl(pirq_router_dev, dev, pirq, irq); + else + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - elcr_set_level_irq(newirq); + if (r->lvl) + r->lvl(pirq_router_dev, dev, pirq, newirq); + else + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } diff --git a/block/Kconfig b/block/Kconfig index 1f951a4b89d6..e7d8cb911a7e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -114,16 +114,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could 
be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. - - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -251,4 +241,8 @@ config BLK_MQ_RDMA config BLK_PM def_bool BLOCK && PM +# do not use in new code +config BLOCK_HOLDER_DEPRECATED + bool + source "block/Kconfig.iosched" diff --git a/block/Makefile b/block/Makefile index 1e1afa10f869..1d0d466f2182 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o @@ -40,3 +39,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 727955918563..480e1a134859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2361,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } @@ -2505,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for 
(j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -5266,8 +5269,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), + ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: /* @@ -5290,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); @@ -5408,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; @@ -6822,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 99c2a3cb081e..a73488eec8a4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -948,15 +948,13 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - 
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 7a462df71f68..b74cc0da118e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. 
*/ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4b4eb8964a6f..6b47cddbbca1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -104,8 +104,7 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -163,27 +162,23 @@ static blk_status_t bio_integrity_process(struct bio *bio, struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..265bff6b549a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -495,16 +495,11 @@ EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - 
bvec_kunmap_irq(data, &flags); - } + bio_for_each_segment(bv, bio, iter) + memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio); @@ -979,6 +974,14 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) return 0; } +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1023,8 +1026,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } __bio_add_page(bio, page, len, offset); } offset = 0; @@ -1069,6 +1074,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); ret = -EINVAL; break; } @@ -1191,27 +1197,15 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf; - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - 
flush_dcache_page(dst_bv.bv_page); + src_buf = bvec_kmap_local(&src_bv); + memcpy_to_bvec(&dst_bv, src_buf); + kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 31fe9be179d9..3c88a79a319b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!blkg->q->disk || !blkg->q->disk->bdi->dev) + return NULL; + return bdi_dev_name(blkg->q->disk->bdi); } /** @@ -873,6 +872,63 @@ static void blkcg_fill_root_iostats(void) } } +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) +{ + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + bool has_stats = false; + const char *dname; + unsigned seq; + int i; + + if (!blkg->online) + return; + + dname = blkg_dev_name(blkg); + if (!dname) + return; + + seq_printf(s, "%s ", dname); + + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + 
atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + if (pol->pd_stat_fn(blkg->pd[i], s)) + has_stats = true; + } + + if (has_stats) + seq_printf(s, "\n"); +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -884,86 +940,11 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; - spin_lock_irq(&blkg->q->queue_lock); - - if (!blkg->online) - goto skip; - - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; - - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. 
- */ - off += scnprintf(buf+off, size-off, "%s ", dname); - - do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } - - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; - - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; - - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } - - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index 4f8449b29b21..b5098739f72a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,7 +14,6 @@ */ #include #include -#include #include #include #include @@ -393,10 +392,7 @@ void blk_cleanup_queue(struct request_queue *q) /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); - if (queue_is_mq(q)) 
blk_mq_exit_queue(q); @@ -533,20 +529,14 @@ struct request_queue *blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_split; q->node = node_id; atomic_set(&q->nr_active_requests_shared_sbitmap, 0); - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); @@ -571,7 +561,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; + goto fail_stats; if (blkcg_init_queue(q)) goto fail_ref; @@ -584,10 +574,8 @@ struct request_queue *blk_alloc_queue(int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); + blk_free_queue_stats(q->stats); fail_split: bioset_exit(&q->bio_split); fail_id: diff --git a/block/blk-crypto.c b/block/blk-crypto.c index fc82fc1aece3..6fb15510a4b3 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -348,7 +348,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, return -EINVAL; } - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 410da060d1f5..69a12177dfb6 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -431,13 +431,15 @@ void blk_integrity_unregister(struct gendisk *disk) } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_add(struct gendisk *disk) +int blk_integrity_add(struct gendisk *disk) { - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, 
"%s", "integrity")) - return; + int ret; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity"); + if (!ret) + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + return ret; } void blk_integrity_del(struct gendisk *disk) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 0e56557cacf2..b3880e4ba22a 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2988,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; - size_t pos = 0; if (!ioc->enabled) - return 0; + return false; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); - pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", - vp10k / 100, vp10k % 100); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } - pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", - iocg->last_stat.usage_us); + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) - pos += scnprintf(buf + pos, size - pos, - " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", - iocg->last_stat.wait_us, - iocg->last_stat.indebt_us, - iocg->last_stat.indelay_us); - - return pos; + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d8b0d8bd132b..c0545f9da549 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -890,8 +890,7 @@ static int iolatency_print_limit(struct seq_file *sf, void 
*v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -906,39 +905,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); + return true; } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return false; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); + return true; } - static struct 
blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..d1448aaad980 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -400,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } diff --git a/block/blk-merge.c b/block/blk-merge.c index a11b3b53717e..eeba8422ae82 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; + + blk_throtl_charge_bio_split(*bio); } } @@ -705,22 +707,6 @@ static void blk_account_io_merge_request(struct request *req) } } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. 
- */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 7b52e7657b2d..253c857cba47 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -198,23 
+150,16 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_ctx_sysfs_release, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index d0b881eed032..944049982e6e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); @@ -3115,7 +3115,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -3124,12 +3125,11 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node); + disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); diff --git a/block/blk-settings.c b/block/blk-settings.c index 902c40d67120..a7c857ad7d10 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -140,7 +141,9 @@ void blk_queue_max_hw_sectors(struct 
request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + if (!q->disk) + return; + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -380,18 +383,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); -void blk_queue_update_readahead(struct request_queue *q) +void disk_update_readahead(struct gendisk *disk) { + struct request_queue *q = disk->queue; + /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ - q->backing_dev_info->ra_pages = + disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - q->backing_dev_info->io_pages = - queue_max_sectors(q) >> (PAGE_SHIFT - 9); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } -EXPORT_SYMBOL_GPL(blk_queue_update_readahead); +EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device @@ -471,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - q->backing_dev_info->ra_pages = + if (!q->disk) + return; + q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); @@ -661,17 +667,11 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: 
Device %s is misaligned\n", - top, bottom); - } - - blk_queue_update_readahead(disk->queue); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 370d83c18057..614d9d47de36 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -88,9 +88,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; + if (!q->disk) + return -EINVAL; + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -98,13 +100,14 @@ static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!q->disk) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -251,7 +254,8 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -766,13 +770,6 @@ static void blk_exit_queue(struct request_queue *q) * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. 
- */ - bdi_put(q->backing_dev_info); } /** @@ -859,15 +856,6 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - if (WARN_ON(!q)) - return -ENXIO; - - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); - - blk_queue_update_readahead(q); - ret = blk_trace_init_sysfs(dev); if (ret) return ret; @@ -941,7 +929,6 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b1b22d863bdf..55c49015e533 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -178,6 +178,9 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -777,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -799,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 
'R' : 'W', tg->slice_start[rw], @@ -1031,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -2052,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2176,6 +2189,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + bool blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 3ed71b8da887..874c1c37bf0c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; return 
time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,7 +359,7 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, + trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, inflight); /* diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 86fce751bb17..1d0c76c18fc5 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; @@ -421,9 +418,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; diff --git a/block/blk.h b/block/blk.h index cb01429c162c..346d184c82b0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -128,7 +128,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } -void blk_integrity_add(struct gendisk *); +int blk_integrity_add(struct gendisk *disk); void blk_integrity_del(struct gendisk *); #else /* 
CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, @@ -162,8 +162,9 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline void blk_integrity_add(struct gendisk *disk) +static inline int blk_integrity_add(struct gendisk *disk) { + return 0; } static inline void blk_integrity_del(struct gendisk *disk) { @@ -289,11 +290,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +extern void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW @@ -340,15 +343,14 @@ static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); -char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); +int bdev_del_partition(struct gendisk *disk, int 
partno); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, @@ -356,7 +358,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -void disk_alloc_events(struct gendisk *disk); +int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); diff --git a/block/bounce.c b/block/bounce.c index 94081e013c58..05fc7148489d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -67,18 +67,6 @@ static __init int init_emergency_pool(void) __initcall(init_emergency_pool); -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. 
kmap it @@ -86,7 +74,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -104,11 +91,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } @@ -255,24 +239,19 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * because the 'bio' is single-page bvec. */ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (!PageHighMem(page)) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } trace_block_bio_bounce(*bio_orig); diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c deleted file mode 100644 index f2a14571882b..000000000000 --- a/block/cmdline-parser.c +++ /dev/null @@ -1,255 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Parse command line, get partition information - * - * Written by Cai Zhiyong - * - */ -#include -#include - -static int parse_subpart(struct 
cmdline_subpart **subpart, char *partdef) -{ - int ret = 0; - struct cmdline_subpart *new_subpart; - - *subpart = NULL; - - new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); - if (!new_subpart) - return -ENOMEM; - - if (*partdef == '-') { - new_subpart->size = (sector_t)(~0ULL); - partdef++; - } else { - new_subpart->size = (sector_t)memparse(partdef, &partdef); - if (new_subpart->size < (sector_t)PAGE_SIZE) { - pr_warn("cmdline partition size is invalid."); - ret = -EINVAL; - goto fail; - } - } - - if (*partdef == '@') { - partdef++; - new_subpart->from = (sector_t)memparse(partdef, &partdef); - } else { - new_subpart->from = (sector_t)(~0ULL); - } - - if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); - - if (!next) { - pr_warn("cmdline partition format is invalid."); - ret = -EINVAL; - goto fail; - } - - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; - - partdef = ++next; - } else - new_subpart->name[0] = '\0'; - - new_subpart->flags = 0; - - if (!strncmp(partdef, "ro", 2)) { - new_subpart->flags |= PF_RDONLY; - partdef += 2; - } - - if (!strncmp(partdef, "lk", 2)) { - new_subpart->flags |= PF_POWERUP_LOCK; - partdef += 2; - } - - *subpart = new_subpart; - return 0; -fail: - kfree(new_subpart); - return ret; -} - -static void free_subpart(struct cmdline_parts *parts) -{ - struct cmdline_subpart *subpart; - - while (parts->subpart) { - subpart = parts->subpart; - parts->subpart = subpart->next_subpart; - kfree(subpart); - } -} - -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) -{ - int ret = -EINVAL; - char *next; - int length; - struct cmdline_subpart **next_subpart; - struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; - - *parts = NULL; - - newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); - if (!newparts) - return -ENOMEM; - - next = strchr(bdevdef, ':'); - if 
(!next) { - pr_warn("cmdline partition has no block device."); - goto fail; - } - - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; - newparts->nr_subparts = 0; - - next_subpart = &newparts->subpart; - - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? (sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strncpy(buf, bdevdef, length); - buf[length] = '\0'; - - ret = parse_subpart(next_subpart, buf); - if (ret) - goto fail; - - newparts->nr_subparts++; - next_subpart = &(*next_subpart)->next_subpart; - } - - if (!newparts->subpart) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - *parts = newparts; - - return 0; -fail: - free_subpart(newparts); - kfree(newparts); - return ret; -} - -void cmdline_parts_free(struct cmdline_parts **parts) -{ - struct cmdline_parts *next_parts; - - while (*parts) { - next_parts = (*parts)->next_parts; - free_subpart(*parts); - kfree(*parts); - *parts = next_parts; - } -} -EXPORT_SYMBOL(cmdline_parts_free); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) -{ - int ret; - char *buf; - char *pbuf; - char *next; - struct cmdline_parts **next_parts; - - *parts = NULL; - - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - next_parts = parts; - - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); - if (ret) - goto fail; - - if (next) - pbuf = ++next; - - next_parts = &(*next_parts)->next_parts; - } - - if (!*parts) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - ret = 0; -done: - kfree(buf); - return ret; - -fail: - cmdline_parts_free(parts); - goto done; -} -EXPORT_SYMBOL(cmdline_parts_parse); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts 
*parts, - const char *bdev) -{ - while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) - parts = parts->next_parts; - return parts; -} -EXPORT_SYMBOL(cmdline_parts_find); - -/* - * add_part() - * 0 success. - * 1 can not add so many partitions. - */ -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param) -{ - sector_t from = 0; - struct cmdline_subpart *subpart; - - for (subpart = parts->subpart; subpart; - subpart = subpart->next_subpart, slot++) { - if (subpart->from == (sector_t)(~0ULL)) - subpart->from = from; - else - from = subpart->from; - - if (from >= disk_size) - break; - - if (subpart->size > (disk_size - from)) - subpart->size = disk_size - from; - - from += subpart->size; - - if (add_part(slot, subpart, param)) - break; - } - - return slot; -} -EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/disk-events.c b/block/disk-events.c index a75931ff5da4..8d5496e7592a 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); } +/* + * Tell userland about new events. Only the events listed in @disk->events are + * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are + * processed internally but never get reported to userland. 
+ */ +static void disk_event_uevent(struct gendisk *disk, unsigned int events) +{ + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + int nr_events = 0, i; + + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; - int nr_events = 0, i; /* check events */ events = disk->fops->check_events(disk, clearing); @@ -190,19 +206,11 @@ static void disk_check_events(struct disk_events *ev, spin_unlock_irq(&ev->lock); - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. - */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; + if (events & DISK_EVENT_MEDIA_CHANGE) + inc_diskseq(disk); - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) + disk_event_uevent(disk, events); } /** @@ -281,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev) } EXPORT_SYMBOL(bdev_check_media_change); +/** + * disk_force_media_change - force a media change event + * @disk: the disk which will raise the event + * @events: the events to raise + * + * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, + * attempt to free all dentries and inodes and invalidates all block + * device page cache entries in that case. 
+ * + * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. + */ +bool disk_force_media_change(struct gendisk *disk, unsigned int events) +{ + disk_event_uevent(disk, events); + + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(disk->part0, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; +} +EXPORT_SYMBOL_GPL(disk_force_media_change); + /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. @@ -410,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -void disk_alloc_events(struct gendisk *disk) +int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) - return; + return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; + return -ENOMEM; } INIT_LIST_HEAD(&ev->node); @@ -432,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk) INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; + return 0; } void disk_add_events(struct gendisk *disk) diff --git a/block/elevator.c b/block/elevator.c index 52ada14cfe45..ff45d8388f48 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } @@ -630,6 +633,9 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + 
return NULL; + if (q->nr_hw_queues != 1 && !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; @@ -702,7 +708,6 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } -EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. be careful not to introduce deadlocks - diff --git a/block/genhd.c b/block/genhd.c index 298ee78c1bda..567549a011d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -29,6 +29,23 @@ static struct kobject *block_depr; +/* + * Unique, monotonically increasing sequential number associated with block + * device instances (i.e. incremented each time a device is attached). + * Associating uevents with block devices in userspace is difficult and racy: + * the uevent netlink socket is lossy, and on slow and overloaded systems has + * a very high latency. + * Block devices do not have exclusive owners in userspace, any process can set + * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 + * can be reused again and again). + * A userspace process setting up a block device and watching for its events + * cannot thus reliably tell whether an event relates to the device it just set + * up or another earlier instance with the same name. + * This sequential number allows userspace processes to solve this problem, and + * uniquely associate a uevent with the lifetime of a device. + */ +static atomic64_t diskseq; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); @@ -60,7 +77,8 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) * initial capacity during probing. 
*/ if (size == capacity || - (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + !disk_live(disk) || + (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", @@ -78,11 +96,17 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* - * Format the device name of the indicated disk into the supplied buffer and - * return a pointer to that same buffer for convenience. + * Format the device name of the indicated block device into the supplied buffer + * and return a pointer to that same buffer for convenience. + * + * Note: do not use this in new code, use the %pg specifier to sprintf and + * printk instead. */ -char *disk_name(struct gendisk *hd, int partno, char *buf) +const char *bdevname(struct block_device *bdev, char *buf) { + struct gendisk *hd = bdev->bd_disk; + int partno = bdev->bd_partno; + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) @@ -92,11 +116,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) return buf; } - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_partno, buf); -} EXPORT_SYMBOL(bdevname); static void part_stat_read_all(struct block_device *part, @@ -294,54 +313,19 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. 
- */ -static int blk_mangle_minor(int minor) -{ -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) { - if (idx == -ENOSPC) - return -EBUSY; - return idx; - } - return blk_mangle_minor(idx); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } void blk_free_ext_minor(unsigned int minor) { - ida_free(&ext_devt_ida, blk_mangle_minor(minor)); + ida_free(&ext_devt_ida, minor); } static char *bdevt_str(dev_t devt, char *buf) @@ -390,78 +374,20 @@ static void disk_scan_partitions(struct gendisk *disk) blkdev_put(bdev, FMODE_READ); } -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) -{ - struct device *ddev = disk_to_dev(disk); - int err; - - ddev->parent = parent; - - dev_set_name(ddev, "%s", disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (groups) { - WARN_ON(ddev->groups); - ddev->groups = groups; - } - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - - /* - * avoid probable deadlock caused by allocating memory with - * GFP_KERNEL in runtime_resume callback of its all ancestor - * devices - */ - pm_runtime_set_memalloc_noio(ddev, true); - - disk->part0->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - if (disk->flags & GENHD_FL_HIDDEN) - return; - - 
disk_scan_partitions(disk); - - /* announce the disk and partitions after all partitions are created */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); - - if (disk->queue->backing_dev_info->dev) { - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); - WARN_ON(err); - } -} - /** - * __device_add_disk - add disk information to kernel list + * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups - * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. - * - * FIXME: error handling */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) +int device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { + struct device *ddev = disk_to_dev(disk); int ret; /* @@ -470,8 +396,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * elevator if one is needed, that is, for devices requesting queue * registration. */ - if (register_queue) - elevator_init_mq(disk->queue); + elevator_init_mq(disk->queue); /* * If the driver provides an explicit major number it also must provide @@ -481,7 +406,8 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * and all partitions from the extended dev_t space. 
*/ if (disk->major) { - WARN_ON(!disk->minors); + if (WARN_ON(!disk->minors)) + return -EINVAL; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -489,21 +415,65 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } } else { - WARN_ON(disk->minors); + if (WARN_ON(disk->minors)) + return -EINVAL; ret = blk_alloc_ext_minor(); - if (ret < 0) { - WARN_ON(1); - return; - } + if (ret < 0) + return ret; disk->major = BLOCK_EXT_MAJOR; - disk->first_minor = MINOR(ret); + disk->first_minor = ret; disk->flags |= GENHD_FL_EXT_DEVT; } - disk->flags |= GENHD_FL_UP; + ret = disk_alloc_events(disk); + if (ret) + goto out_free_ext_minor; - disk_alloc_events(disk); + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + ddev->parent = parent; + ddev->groups = groups; + dev_set_name(ddev, "%s", disk->disk_name); + if (!(disk->flags & GENHD_FL_HIDDEN)) + ddev->devt = MKDEV(disk->major, disk->first_minor); + ret = device_add(ddev); + if (ret) + goto out_disk_release_events; + if (!sysfs_deprecated) { + ret = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (ret) + goto out_device_del; + } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + ret = blk_integrity_add(disk); + if (ret) + goto out_del_block_link; + + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); + if (!disk->part0->bd_holder_dir) + goto out_del_integrity; + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (!disk->slave_dir) + goto out_put_holder_dir; + + ret = bd_register_pending_holders(disk); + if (ret < 0) + goto out_put_slave_dir; + + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; if (disk->flags & GENHD_FL_HIDDEN) { /* @@ -513,48 
+483,56 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; - struct device *dev = disk_to_dev(disk); - - /* Register BDI before referencing it from bdev */ - dev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(bdi, "%u:%u", + ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); - WARN_ON(ret); - bdi_set_owner(bdi, dev); - bdev_add(disk->part0, dev->devt); + if (ret) + goto out_unregister_queue; + bdi_set_owner(disk->bdi, ddev); + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; + + bdev_add(disk->part0, ddev->devt); + disk_scan_partitions(disk); + + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); } - register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); - - /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. 
- */ - if (blk_get_queue(disk->queue)) - set_bit(GD_QUEUE_REF, &disk->state); - else - WARN_ON_ONCE(1); + disk_update_readahead(disk); disk_add_events(disk); - blk_integrity_add(disk); -} + return 0; -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - -{ - __device_add_disk(parent, disk, groups, true); +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); +out_put_slave_dir: + kobject_put(disk->slave_dir); +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_integrity: + blk_integrity_del(disk); +out_del_block_link: + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(ddev)); +out_device_del: + device_del(ddev); +out_disk_release_events: + disk_release_events(disk); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); + return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ } EXPORT_SYMBOL(device_add_disk); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) -{ - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove @@ -578,26 +556,20 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk->queue)) + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; blk_integrity_del(disk); disk_del_events(disk); mutex_lock(&disk->open_mutex); - disk->flags &= ~GENHD_FL_UP; + remove_inode_hash(disk->part0->bd_inode); blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. 
- */ - remove_inode_hash(disk->part0->bd_inode); - set_capacity(disk, 0); if (!(disk->flags & GENHD_FL_HIDDEN)) { @@ -607,7 +579,7 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + bdi_unregister(disk->bdi); } blk_unregister_queue(disk); @@ -683,7 +655,6 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct block_device *part; - char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; unsigned long idx; @@ -703,11 +674,10 @@ void __init printk_all_partitions(void) xa_for_each(&disk->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - printk("%s%s %10llu %s %s", + printk("%s%s %10llu %pg %s", bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, - disk_name(disk, part->bd_partno, name_buf), + bdev_nr_sectors(part) >> 1, part, part->bd_meta_info ? 
part->bd_meta_info->uuid : ""); if (bdev_is_partition(part)) @@ -785,7 +755,6 @@ static int show_partition(struct seq_file *seqf, void *v) struct gendisk *sgp = v; struct block_device *part; unsigned long idx; - char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (!disk_max_parts(sgp) && @@ -798,10 +767,9 @@ static int show_partition(struct seq_file *seqf, void *v) xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - seq_printf(seqf, "%4d %7d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), - bdev_nr_sectors(part) >> 1, - disk_name(sgp, part->bd_partno, buf)); + bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; @@ -968,6 +936,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -980,6 +956,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1025,6 +1002,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1074,17 +1052,24 @@ static void 
disk_release(struct device *dev) might_sleep(); - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) - blk_put_queue(disk->queue); - bdput(disk->part0); /* frees the disk */ + disk->queue->disk = NULL; + blk_put_queue(disk->queue); + iput(disk->part0->bd_inode); /* frees the disk */ } + +static int block_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct gendisk *disk = dev_to_disk(dev); + + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); +} + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; static char *block_devnode(struct device *dev, umode_t *mode, @@ -1116,7 +1101,6 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; - char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; unsigned long idx; @@ -1139,15 +1123,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) else inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), - disk_name(gp, hd->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1239,17 +1222,25 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; + if (!blk_get_queue(q)) + return NULL; + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) - return NULL; + goto out_put_queue; + + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) + goto out_free_disk; disk->part0 = 
bdev_alloc(disk, 0); if (!disk->part0) - goto out_free_disk; + goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); @@ -1257,23 +1248,33 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); + inc_diskseq(disk); + disk->queue = q; + q->disk = disk; + lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk; out_destroy_part_tbl: xa_destroy(&disk->part_tbl); - bdput(disk->part0); + iput(disk->part0->bd_inode); +out_free_bdi: + bdi_put(disk->bdi); out_free_disk: kfree(disk); +out_put_queue: + blk_put_queue(q); return NULL; } EXPORT_SYMBOL(__alloc_disk_node); -struct gendisk *__blk_alloc_disk(int node) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -1282,12 +1283,11 @@ struct gendisk *__blk_alloc_disk(int node) if (!q) return NULL; - disk = __alloc_disk_node(0, node); + disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_alloc_disk); @@ -1362,3 +1362,8 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); + +void inc_diskseq(struct gendisk *disk) +{ + disk->diskseq = atomic64_inc_return(&diskseq); +} diff --git a/block/holder.c b/block/holder.c new file mode 100644 index 000000000000..9dc084182337 --- /dev/null +++ b/block/holder.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +struct bd_holder_disk { + struct list_head list; + struct block_device *bdev; + int refcnt; +}; + +static struct 
bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->bdev == bdev) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + int ret; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + return ret; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + return ret; +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This function creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&disk->open_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!bdev->bd_holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->bdev = bdev; + holder->refcnt = 1; + if (disk->slave_dir) { + ret = __link_disk_holder(bdev, disk); + if (ret) { + kfree(holder); + goto out_unlock; + } + } + + list_add(&holder->list, &disk->slave_bdevs); + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we need + * to keep our own here to allow for cleanup past that point. + */ + kobject_get(bdev->bd_holder_dir); + +out_unlock: + mutex_unlock(&disk->open_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +static void __unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); +} + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. 
+ */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&disk->open_mutex); + holder = bd_find_holder_disk(bdev, disk); + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + if (disk->slave_dir) + __unlink_disk_holder(bdev, disk); + kobject_put(bdev->bd_holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + mutex_unlock(&disk->open_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); + +int bd_register_pending_holders(struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret; + + mutex_lock(&disk->open_mutex); + list_for_each_entry(holder, &disk->slave_bdevs, list) { + ret = __link_disk_holder(holder->bdev, disk); + if (ret) + goto out_undo; + } + mutex_unlock(&disk->open_mutex); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) + __unlink_disk_holder(holder->bdev, disk); + mutex_unlock(&disk->open_mutex); + return ret; +} diff --git a/block/ioctl.c b/block/ioctl.c index 24beec9ca9c9..eb0491e90b9a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,6 +16,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { + struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; long long start, length; @@ -30,7 +31,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; if (op == BLKPG_DEL_PARTITION) - return bdev_del_partition(bdev, p.pno); + return bdev_del_partition(disk, p.pno); start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; @@ -40,9 +41,9 @@ static int blkpg_do_ioctl(struct block_device *bdev, /* check if partition is aligned to blocksize */ if (p.start & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return 
bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } @@ -469,6 +470,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: @@ -504,7 +507,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: return blkdev_reread_part(bdev, mode); @@ -554,7 +557,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -626,7 +630,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) diff --git a/block/ioprio.c b/block/ioprio.c index bee628f9f1b2..0e4ff245f2bf 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; @@ -171,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + 
ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -183,9 +182,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = IOPRIO_DEFAULT; return min(aprio, bprio); } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 36920670dccc..3c3693c34f06 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -629,6 +629,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669e5..278593b8e4e9 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -264,7 +264,6 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958bf..2c381c694c57 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. 
*/ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a97..85f4b967565e 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -66,22 +66,6 @@ struct pvd { #define LVM_MAXLVS 256 -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - /** * read_lba(): Read bytes from disk, starting at given LBA * @state @@ -89,7 +73,7 @@ static u64 last_lba(struct block_device *bdev) * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. 
*/ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9aed..5c8624e26a54 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state) int start_sect, nr_sects, blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) @@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state) } pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -84,7 +83,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c8522..da5994175416 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. 
*/ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, §); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67f0..d5b28e309d64 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -9,7 +9,7 @@ * description. */ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cde4..1af610f0ba8c 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,248 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ - -#include - +#include +#include +#include #include "check.h" + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + 
new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + 
next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int 
add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -123,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -140,14 +396,13 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; - cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index 4230d4f71879..58c4c362c94f 100644 --- a/block/partitions/core.c +++ 
b/block/partitions/core.c @@ -135,8 +135,8 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } state->pp_buf[0] = '\0'; - state->bdev = hd->part0; - disk_name(hd, 0, state->name); + state->disk = hd; + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -259,9 +259,8 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); - bdput(dev_to_bdev(dev)); + put_disk(dev_to_bdev(dev)->bd_disk); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -281,12 +280,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -/* - * Must be called either with open_mutex held, before a disk can be opened or - * after all disk users are gone. - */ static void delete_partition(struct block_device *part) { + lockdep_assert_held(&part->bd_disk->open_mutex); + fsync_bdev(part); __invalidate_device(part, true); @@ -351,20 +348,17 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); + + err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) - return ERR_PTR(-ENOMEM); + goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - if (info) { - err = -ENOMEM; - bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); - if (!bdev->bd_meta_info) - goto out_bdput; - } - pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) @@ -388,6 +382,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = 
kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -417,14 +418,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; -out_bdput: - bdput(bdev); - return ERR_PTR(err); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); return ERR_PTR(err); } @@ -449,15 +449,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part; - struct gendisk *disk = bdev->bd_disk; int ret; mutex_lock(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) { + if (!disk_live(disk)) { ret = -ENXIO; goto out; } @@ -475,13 +474,13 @@ out: return ret; } -int bdev_del_partition(struct block_device *bdev, int partno) +int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -492,18 +491,18 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = 
xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -512,14 +511,14 @@ int bdev_resize_partition(struct block_device *bdev, int partno, goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } @@ -667,7 +666,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) lockdep_assert_held(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) return -ENXIO; rescan: @@ -715,10 +714,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct page *page; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; return NULL; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e2716792ecc1..7ca5c4c374d4 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. 
*/ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(disk->part0->bd_inode->i_size, + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ done: * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. 
*/ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,15 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -621,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; @@ -705,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +732,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 4b044e620d35..9bca396aef4a 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions *state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index b8b518d7fb77..27f6c7d9c776 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -304,7 +304,7 @@ 
static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ out: * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. 
* - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b6095335636c..7b521df00a39 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 
f5102596a984..b5d5c229cc3b 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -579,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb0515..9cc6b8c1eea4 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, §); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c 
b/block/partitions/sun.c index 47dc53eccf77..ddf9e6def4b2 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a41..00c203b2a921 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int 
nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } diff --git a/drivers/Kconfig b/drivers/Kconfig index 8bad63417a50..30d2db37cc87 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -51,8 +51,6 @@ source "drivers/net/Kconfig" source "drivers/isdn/Kconfig" -source "drivers/lightnvm/Kconfig" - # input before char - char/joystick depends on it. As does USB. source "drivers/input/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 27c018bdf4de..be5d40ae1488 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ -obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DAX) += dax/ diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index fec2e9754aed..5b3fa2cbe722 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(ahci_shost_attrs); struct device_attribute *ahci_sdev_attrs[] = { &dev_attr_sw_activity, &dev_attr_unload_heads, + &dev_attr_ncq_prio_supported, &dev_attr_ncq_prio_enable, NULL }; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 61c762961ca8..b8459c54f739 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -159,6 +159,12 @@ MODULE_DESCRIPTION("Library module for ATA devices"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); +static inline bool ata_dev_print_info(struct ata_device *dev) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + + return ehc->i.flags & ATA_EHI_PRINTINFO; +} static bool ata_sstatus_online(u32 sstatus) { @@ -706,11 +712,9 @@ int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, if (tf->flags & ATA_TFLAG_FUA) tf->device |= 1 << 7; - if (dev->flags & ATA_DFLAG_NCQ_PRIO) { - if (class == IOPRIO_CLASS_RT) - tf->hob_nsect |= ATA_PRIO_HIGH << - ATA_SHIFT_PRIO; - } + 
if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE && + class == IOPRIO_CLASS_RT) + tf->hob_nsect |= ATA_PRIO_HIGH << ATA_SHIFT_PRIO; } else if (dev->flags & ATA_DFLAG_LBA) { tf->flags |= ATA_TFLAG_LBA; @@ -1266,8 +1270,7 @@ static int ata_set_max_sectors(struct ata_device *dev, u64 new_sectors) */ static int ata_hpa_resize(struct ata_device *dev) { - struct ata_eh_context *ehc = &dev->link->eh_context; - int print_info = ehc->i.flags & ATA_EHI_PRINTINFO; + bool print_info = ata_dev_print_info(dev); bool unlock_hpa = ata_ignore_hpa || dev->flags & ATA_DFLAG_UNLOCK_HPA; u64 sectors = ata_id_n_sectors(dev->id); u64 native_sectors; @@ -2023,13 +2026,15 @@ retry: err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE, buf, sectors * ATA_SECT_SIZE, 0); - if (err_mask && dma) { - dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; - ata_dev_warn(dev, "READ LOG DMA EXT failed, trying PIO\n"); - goto retry; + if (err_mask) { + if (dma) { + dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; + goto retry; + } + ata_dev_err(dev, "Read log page 0x%02x failed, Emask 0x%x\n", + (unsigned int)page, err_mask); } - DPRINTK("EXIT, err_mask=%x\n", err_mask); return err_mask; } @@ -2058,12 +2063,8 @@ static bool ata_identify_page_supported(struct ata_device *dev, u8 page) */ err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, 0, ap->sector_buf, 1); - if (err) { - ata_dev_info(dev, - "failed to get Device Identify Log Emask 0x%x\n", - err); + if (err) return false; - } for (i = 0; i < ap->sector_buf[8]; i++) { if (ap->sector_buf[9 + i] == page) @@ -2127,11 +2128,7 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev) } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, 0, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get NCQ Send/Recv Log Emask 0x%x\n", - err_mask); - } else { + if (!err_mask) { u8 *cmds = dev->ncq_send_recv_cmds; dev->flags |= ATA_DFLAG_NCQ_SEND_RECV; @@ -2157,11 +2154,7 @@ static void ata_dev_config_ncq_non_data(struct ata_device *dev) } err_mask 
= ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, 0, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get NCQ Non-Data Log Emask 0x%x\n", - err_mask); - } else { + if (!err_mask) { u8 *cmds = dev->ncq_non_data_cmds; memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE); @@ -2173,30 +2166,24 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) struct ata_port *ap = dev->link->ap; unsigned int err_mask; - if (!(dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE)) { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO; - return; - } - err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get Identify Device data, Emask 0x%x\n", - err_mask); - return; - } + if (err_mask) + goto not_supported; - if (ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3)) { - dev->flags |= ATA_DFLAG_NCQ_PRIO; - } else { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO; - ata_dev_dbg(dev, "SATA page does not support priority\n"); - } + if (!(ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3))) + goto not_supported; + dev->flags |= ATA_DFLAG_NCQ_PRIO; + + return; + +not_supported: + dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; + dev->flags &= ~ATA_DFLAG_NCQ_PRIO; } static int ata_dev_config_ncq(struct ata_device *dev, @@ -2346,11 +2333,8 @@ static void ata_dev_config_trusted(struct ata_device *dev) err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SECURITY, ap->sector_buf, 1); - if (err) { - ata_dev_dbg(dev, - "failed to read Security Log, Emask 0x%x\n", err); + if (err) return; - } trusted_cap = get_unaligned_le64(&ap->sector_buf[40]); if (!(trusted_cap & (1ULL << 63))) { @@ -2363,6 +2347,106 @@ static void ata_dev_config_trusted(struct ata_device *dev) dev->flags |= ATA_DFLAG_TRUSTED; } +static int ata_dev_config_lba(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + const u16 *id = dev->id; + const char *lba_desc; + char ncq_desc[24]; + int ret; + + dev->flags |= ATA_DFLAG_LBA; + + if 
(ata_id_has_lba48(id)) { + lba_desc = "LBA48"; + dev->flags |= ATA_DFLAG_LBA48; + if (dev->n_sectors >= (1UL << 28) && + ata_id_has_flush_ext(id)) + dev->flags |= ATA_DFLAG_FLUSH_EXT; + } else { + lba_desc = "LBA"; + } + + /* config NCQ */ + ret = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); + + /* print device info to dmesg */ + if (ata_msg_drv(ap) && ata_dev_print_info(dev)) + ata_dev_info(dev, + "%llu sectors, multi %u: %s %s\n", + (unsigned long long)dev->n_sectors, + dev->multi_count, lba_desc, ncq_desc); + + return ret; +} + +static void ata_dev_config_chs(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + const u16 *id = dev->id; + + if (ata_id_current_chs_valid(id)) { + /* Current CHS translation is valid. */ + dev->cylinders = id[54]; + dev->heads = id[55]; + dev->sectors = id[56]; + } else { + /* Default translation */ + dev->cylinders = id[1]; + dev->heads = id[3]; + dev->sectors = id[6]; + } + + /* print device info to dmesg */ + if (ata_msg_drv(ap) && ata_dev_print_info(dev)) + ata_dev_info(dev, + "%llu sectors, multi %u, CHS %u/%u/%u\n", + (unsigned long long)dev->n_sectors, + dev->multi_count, dev->cylinders, + dev->heads, dev->sectors); +} + +static void ata_dev_config_devslp(struct ata_device *dev) +{ + u8 *sata_setting = dev->link->ap->sector_buf; + unsigned int err_mask; + int i, j; + + /* + * Check device sleep capability. Get DevSlp timing variables + * from SATA Settings page of Identify Device Data Log. 
+ */ + if (!ata_id_has_devslp(dev->id)) + return; + + err_mask = ata_read_log_page(dev, + ATA_LOG_IDENTIFY_DEVICE, + ATA_LOG_SATA_SETTINGS, + sata_setting, 1); + if (err_mask) + return; + + dev->flags |= ATA_DFLAG_DEVSLP; + for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { + j = ATA_LOG_DEVSLP_OFFSET + i; + dev->devslp_timing[i] = sata_setting[j]; + } +} + +static void ata_dev_print_features(struct ata_device *dev) +{ + if (!(dev->flags & ATA_DFLAG_FEATURES_MASK)) + return; + + ata_dev_info(dev, + "Features:%s%s%s%s%s\n", + dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "", + dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "", + dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "", + dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "", + dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : ""); +} + /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure @@ -2379,8 +2463,7 @@ static void ata_dev_config_trusted(struct ata_device *dev) int ata_dev_configure(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; - struct ata_eh_context *ehc = &dev->link->eh_context; - int print_info = ehc->i.flags & ATA_EHI_PRINTINFO; + bool print_info = ata_dev_print_info(dev); const u16 *id = dev->id; unsigned long xfer_mask; unsigned int err_mask; @@ -2507,91 +2590,28 @@ int ata_dev_configure(struct ata_device *dev) dev->multi_count = cnt; } + /* print device info to dmesg */ + if (ata_msg_drv(ap) && print_info) + ata_dev_info(dev, "%s: %s, %s, max %s\n", + revbuf, modelbuf, fwrevbuf, + ata_mode_string(xfer_mask)); + if (ata_id_has_lba(id)) { - const char *lba_desc; - char ncq_desc[24]; - - lba_desc = "LBA"; - dev->flags |= ATA_DFLAG_LBA; - if (ata_id_has_lba48(id)) { - dev->flags |= ATA_DFLAG_LBA48; - lba_desc = "LBA48"; - - if (dev->n_sectors >= (1UL << 28) && - ata_id_has_flush_ext(id)) - dev->flags |= ATA_DFLAG_FLUSH_EXT; - } - - /* config NCQ */ - rc = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); + rc = ata_dev_config_lba(dev); 
if (rc) return rc; - - /* print device info to dmesg */ - if (ata_msg_drv(ap) && print_info) { - ata_dev_info(dev, "%s: %s, %s, max %s\n", - revbuf, modelbuf, fwrevbuf, - ata_mode_string(xfer_mask)); - ata_dev_info(dev, - "%llu sectors, multi %u: %s %s\n", - (unsigned long long)dev->n_sectors, - dev->multi_count, lba_desc, ncq_desc); - } } else { - /* CHS */ - - /* Default translation */ - dev->cylinders = id[1]; - dev->heads = id[3]; - dev->sectors = id[6]; - - if (ata_id_current_chs_valid(id)) { - /* Current CHS translation is valid. */ - dev->cylinders = id[54]; - dev->heads = id[55]; - dev->sectors = id[56]; - } - - /* print device info to dmesg */ - if (ata_msg_drv(ap) && print_info) { - ata_dev_info(dev, "%s: %s, %s, max %s\n", - revbuf, modelbuf, fwrevbuf, - ata_mode_string(xfer_mask)); - ata_dev_info(dev, - "%llu sectors, multi %u, CHS %u/%u/%u\n", - (unsigned long long)dev->n_sectors, - dev->multi_count, dev->cylinders, - dev->heads, dev->sectors); - } + ata_dev_config_chs(dev); } - /* Check and mark DevSlp capability. Get DevSlp timing variables - * from SATA Settings page of Identify Device Data Log. 
- */ - if (ata_id_has_devslp(dev->id)) { - u8 *sata_setting = ap->sector_buf; - int i, j; - - dev->flags |= ATA_DFLAG_DEVSLP; - err_mask = ata_read_log_page(dev, - ATA_LOG_IDENTIFY_DEVICE, - ATA_LOG_SATA_SETTINGS, - sata_setting, - 1); - if (err_mask) - ata_dev_dbg(dev, - "failed to get Identify Device Data, Emask 0x%x\n", - err_mask); - else - for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { - j = ATA_LOG_DEVSLP_OFFSET + i; - dev->devslp_timing[i] = sata_setting[j]; - } - } + ata_dev_config_devslp(dev); ata_dev_config_sense_reporting(dev); ata_dev_config_zac(dev); ata_dev_config_trusted(dev); dev->cdb_len = 32; + + if (ata_msg_drv(ap) && print_info) + ata_dev_print_features(dev); } /* ATAPI-specific feature tests */ @@ -5573,7 +5593,7 @@ int ata_host_start(struct ata_host *host) have_stop = 1; } - if (host->ops->host_stop) + if (host->ops && host->ops->host_stop) have_stop = 1; if (have_stop) { diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index 8adeab76dd38..8f3ff830ab0c 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -834,28 +834,46 @@ DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_store); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); +static ssize_t ata_ncq_prio_supported_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(device); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *dev; + bool ncq_prio_supported; + int rc = 0; + + spin_lock_irq(ap->lock); + dev = ata_scsi_find_dev(ap, sdev); + if (!dev) + rc = -ENODEV; + else + ncq_prio_supported = dev->flags & ATA_DFLAG_NCQ_PRIO; + spin_unlock_irq(ap->lock); + + return rc ? 
rc : sysfs_emit(buf, "%u\n", ncq_prio_supported); +} + +DEVICE_ATTR(ncq_prio_supported, S_IRUGO, ata_ncq_prio_supported_show, NULL); +EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_supported); + static ssize_t ata_ncq_prio_enable_show(struct device *device, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(device); - struct ata_port *ap; + struct ata_port *ap = ata_shost_to_port(sdev->host); struct ata_device *dev; bool ncq_prio_enable; int rc = 0; - ap = ata_shost_to_port(sdev->host); - spin_lock_irq(ap->lock); dev = ata_scsi_find_dev(ap, sdev); - if (!dev) { + if (!dev) rc = -ENODEV; - goto unlock; - } - - ncq_prio_enable = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE; - -unlock: + else + ncq_prio_enable = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE; spin_unlock_irq(ap->lock); return rc ? rc : snprintf(buf, 20, "%u\n", ncq_prio_enable); @@ -869,7 +887,7 @@ static ssize_t ata_ncq_prio_enable_store(struct device *device, struct ata_port *ap; struct ata_device *dev; long int input; - int rc; + int rc = 0; rc = kstrtol(buf, 10, &input); if (rc) @@ -883,27 +901,20 @@ static ssize_t ata_ncq_prio_enable_store(struct device *device, return -ENODEV; spin_lock_irq(ap->lock); + + if (!(dev->flags & ATA_DFLAG_NCQ_PRIO)) { + rc = -EINVAL; + goto unlock; + } + if (input) dev->flags |= ATA_DFLAG_NCQ_PRIO_ENABLE; else dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; - dev->link->eh_info.action |= ATA_EH_REVALIDATE; - dev->link->eh_info.flags |= ATA_EHI_QUIET; - ata_port_schedule_eh(ap); +unlock: spin_unlock_irq(ap->lock); - ata_port_wait_eh(ap); - - if (input) { - spin_lock_irq(ap->lock); - if (!(dev->flags & ATA_DFLAG_NCQ_PRIO)) { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; - rc = -EIO; - } - spin_unlock_irq(ap->lock); - } - return rc ? 
rc : len; } @@ -914,6 +925,7 @@ EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_enable); struct device_attribute *ata_ncq_sdev_attrs[] = { &dev_attr_unload_heads, &dev_attr_ncq_prio_enable, + &dev_attr_ncq_prio_supported, NULL }; EXPORT_SYMBOL_GPL(ata_ncq_sdev_attrs); diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b9588c52815d..0b7b4624e4df 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1765,53 +1765,6 @@ struct ata_scsi_args { struct scsi_cmnd *cmd; }; -/** - * ata_scsi_rbuf_get - Map response buffer. - * @cmd: SCSI command containing buffer to be mapped. - * @flags: unsigned long variable to store irq enable status - * @copy_in: copy in from user buffer - * - * Prepare buffer for simulated SCSI commands. - * - * LOCKING: - * spin_lock_irqsave(ata_scsi_rbuf_lock) on success - * - * RETURNS: - * Pointer to response buffer. - */ -static void *ata_scsi_rbuf_get(struct scsi_cmnd *cmd, bool copy_in, - unsigned long *flags) -{ - spin_lock_irqsave(&ata_scsi_rbuf_lock, *flags); - - memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE); - if (copy_in) - sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), - ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); - return ata_scsi_rbuf; -} - -/** - * ata_scsi_rbuf_put - Unmap response buffer. - * @cmd: SCSI command containing buffer to be unmapped. - * @copy_out: copy out result - * @flags: @flags passed to ata_scsi_rbuf_get() - * - * Returns rbuf buffer. The result is copied to @cmd's buffer if - * @copy_back is true. - * - * LOCKING: - * Unlocks ata_scsi_rbuf_lock. - */ -static inline void ata_scsi_rbuf_put(struct scsi_cmnd *cmd, bool copy_out, - unsigned long *flags) -{ - if (copy_out) - sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), - ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); - spin_unlock_irqrestore(&ata_scsi_rbuf_lock, *flags); -} - /** * ata_scsi_rbuf_fill - wrapper for SCSI command simulators * @args: device IDENTIFY data / SCSI command of interest. 
@@ -1830,14 +1783,19 @@ static inline void ata_scsi_rbuf_put(struct scsi_cmnd *cmd, bool copy_out, static void ata_scsi_rbuf_fill(struct ata_scsi_args *args, unsigned int (*actor)(struct ata_scsi_args *args, u8 *rbuf)) { - u8 *rbuf; unsigned int rc; struct scsi_cmnd *cmd = args->cmd; unsigned long flags; - rbuf = ata_scsi_rbuf_get(cmd, false, &flags); - rc = actor(args, rbuf); - ata_scsi_rbuf_put(cmd, rc == 0, &flags); + spin_lock_irqsave(&ata_scsi_rbuf_lock, flags); + + memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE); + rc = actor(args, ata_scsi_rbuf); + if (rc == 0) + sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), + ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); + + spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags); if (rc == 0) cmd->result = SAM_STAT_GOOD; diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index f0ef844428bb..338c2e50f759 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -1259,24 +1259,20 @@ static int sata_dwc_probe(struct platform_device *ofdev) irq = irq_of_parse_and_map(np, 0); if (irq == NO_IRQ) { dev_err(&ofdev->dev, "no SATA DMA irq\n"); - err = -ENODEV; - goto error_out; + return -ENODEV; } #ifdef CONFIG_SATA_DWC_OLD_DMA if (!of_find_property(np, "dmas", NULL)) { err = sata_dwc_dma_init_old(ofdev, hsdev); if (err) - goto error_out; + return err; } #endif hsdev->phy = devm_phy_optional_get(hsdev->dev, "sata-phy"); - if (IS_ERR(hsdev->phy)) { - err = PTR_ERR(hsdev->phy); - hsdev->phy = NULL; - goto error_out; - } + if (IS_ERR(hsdev->phy)) + return PTR_ERR(hsdev->phy); err = phy_init(hsdev->phy); if (err) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 95694113e38e..58ec167aa018 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -27,9 +27,6 @@ #include -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) - /* * Each block ramdisk device has a radix_tree brd_pages of pages that stores * the pages containing the block 
device's contents. A brd page's ->index is diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e7d0e637e632..44ccf8b4f4b2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1364,7 +1364,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { blk_stack_limits(&q->limits, &b->limits, 0); - blk_queue_update_readahead(q); + disk_update_readahead(device->vdisk); } fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 13beb98a7c5a..5ca233644d70 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -905,13 +905,12 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, enum drbd_read_balancing rbm) { - struct backing_dev_info *bdi; int stripe_shift; switch (rbm) { case RB_CONGESTED_REMOTE: - bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; - return bdi_read_congested(bdi); + return bdi_read_congested( + device->ldev->backing_bdev->bd_disk->bdi); case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 87460e0e5c72..fef79ea52e3e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4029,23 +4029,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (fdc_state[FDC(drive)].rawcmd == 1) fdc_state[FDC(drive)].rawcmd = 2; - if (mode & (FMODE_READ|FMODE_WRITE)) { - drive_state[drive].last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); - if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, 
&drive_state[drive].flags)) + if (!(mode & FMODE_NDELAY)) { + if (mode & (FMODE_READ|FMODE_WRITE)) { + drive_state[drive].last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, + &drive_state[drive].flags); + if (bdev_check_media_change(bdev)) + floppy_revalidate(bdev->bd_disk); + if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) + goto out; + } + res = -EROFS; + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) goto out; } - - res = -EROFS; - - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) - goto out; - mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0cdff0c5fbf..fa1c298a8cfb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -774,6 +774,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_err; /* and ... switch */ + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; @@ -1257,6 +1258,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, goto out_unlock; } + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); @@ -1304,10 +1306,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). 
- */ - bdgrab(bdev); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); @@ -1398,7 +1396,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); if (bdev) { - bdput(bdev); invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; } @@ -1415,6 +1412,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { @@ -2335,7 +2333,8 @@ static int loop_add(int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); @@ -2391,6 +2390,8 @@ static int loop_add(int i) disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); add_disk(disk); mutex_unlock(&loop_ctl_mutex); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 19f5d5a8b16a..5170a630778d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -49,6 +49,7 @@ static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); +static struct workqueue_struct *nbd_del_wq; static int nbd_total_devices = 0; struct nbd_sock { @@ -113,12 +114,12 @@ struct nbd_device { struct mutex config_lock; struct gendisk *disk; struct workqueue_struct *recv_workq; + struct work_struct remove_work; struct list_head list; struct task_struct *task_recv; struct task_struct *task_setup; - struct completion *destroy_complete; unsigned long flags; char 
*backend; @@ -237,32 +238,36 @@ static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; - if (disk) { - del_gendisk(disk); - blk_cleanup_disk(disk); - blk_mq_free_tag_set(&nbd->tag_set); - } + del_gendisk(disk); + blk_cleanup_disk(disk); + blk_mq_free_tag_set(&nbd->tag_set); /* - * Place this in the last just before the nbd is freed to - * make sure that the disk and the related kobject are also - * totally removed to avoid duplicate creation of the same - * one. + * Remove from idr after del_gendisk() completes, so if the same ID is + * reused, the following add_disk() will succeed. */ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) - complete(nbd->destroy_complete); + mutex_lock(&nbd_index_mutex); + idr_remove(&nbd_index_idr, nbd->index); + mutex_unlock(&nbd_index_mutex); kfree(nbd); } +static void nbd_dev_remove_work(struct work_struct *work) +{ + nbd_dev_remove(container_of(work, struct nbd_device, remove_work)); +} + static void nbd_put(struct nbd_device *nbd) { - if (refcount_dec_and_mutex_lock(&nbd->refs, - &nbd_index_mutex)) { - idr_remove(&nbd_index_idr, nbd->index); + if (!refcount_dec_and_test(&nbd->refs)) + return; + + /* Call del_gendisk() asynchronously to prevent deadlock */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + queue_work(nbd_del_wq, &nbd->remove_work); + else nbd_dev_remove(nbd); - mutex_unlock(&nbd_index_mutex); - } } static int nbd_disconnected(struct nbd_config *config) @@ -1388,6 +1393,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1402,8 +1408,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SIZE: return nbd_set_size(nbd, arg, config->blksize); case NBD_SET_SIZE_BLOCKS: - return nbd_set_size(nbd, arg * config->blksize, - config->blksize); + if
(check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; + return nbd_set_size(nbd, bytesize, config->blksize); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; @@ -1665,7 +1672,7 @@ static const struct blk_mq_ops nbd_mq_ops = { .timeout = nbd_xmit_timeout, }; -static int nbd_dev_add(int index) +static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct nbd_device *nbd; struct gendisk *disk; @@ -1683,13 +1690,14 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; - nbd->destroy_complete = NULL; + INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->backend = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); if (err) goto out_free_nbd; + mutex_lock(&nbd_index_mutex); if (index >= 0) { err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, GFP_KERNEL); @@ -1700,9 +1708,10 @@ static int nbd_dev_add(int index) if (err >= 0) index = err; } + nbd->index = index; + mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; - nbd->index = index; disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); if (IS_ERR(disk)) { @@ -1726,38 +1735,65 @@ static int nbd_dev_add(int index) mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); - refcount_set(&nbd->refs, 1); + /* + * Start out with zero references to keep other threads from using + * this device until it is fully initialized. + */ + refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; + + /* A too-large first_minor can cause duplicate creation of + * sysfs files/links, since first_minor will be truncated to + * a byte in __device_add_disk(). + */ disk->first_minor = index << part_shift; + if (disk->first_minor > 0xff) { + err = -EINVAL; + goto out_free_idr; + } + disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); add_disk(disk); + + /* + * Now publish the device.
+ */ + refcount_set(&nbd->refs, refs); nbd_total_devices++; - return index; + return nbd; out_free_idr: + mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); + mutex_unlock(&nbd_index_mutex); out_free_tags: blk_mq_free_tag_set(&nbd->tag_set); out_free_nbd: kfree(nbd); out: - return err; + return ERR_PTR(err); } -static int find_free_cb(int id, void *ptr, void *data) +static struct nbd_device *nbd_find_get_unused(void) { - struct nbd_device *nbd = ptr; - struct nbd_device **found = data; + struct nbd_device *nbd; + int id; - if (!refcount_read(&nbd->config_refs)) { - *found = nbd; - return 1; + lockdep_assert_held(&nbd_index_mutex); + + idr_for_each_entry(&nbd_index_idr, nbd, id) { + if (refcount_read(&nbd->config_refs) || + test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + continue; + if (refcount_inc_not_zero(&nbd->refs)) + return nbd; } - return 0; + + return NULL; } /* Netlink interface. */ @@ -1806,8 +1842,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { - DECLARE_COMPLETION_ONSTACK(destroy_complete); - struct nbd_device *nbd = NULL; + struct nbd_device *nbd; struct nbd_config *config; int index = -1; int ret; @@ -1829,56 +1864,30 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) again: mutex_lock(&nbd_index_mutex); if (index == -1) { - ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); - if (ret == 0) { - int new_index; - new_index = nbd_dev_add(-1); - if (new_index < 0) { - mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return new_index; - } - nbd = idr_find(&nbd_index_idr, new_index); - } + nbd = nbd_find_get_unused(); } else { nbd = idr_find(&nbd_index_idr, index); - if (!nbd) { - ret = nbd_dev_add(index); - if (ret < 0) { + if (nbd) { + if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) || + 
!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return ret; + pr_err("nbd: device at index %d is going down\n", + index); + return -EINVAL; } - nbd = idr_find(&nbd_index_idr, index); } } - if (!nbd) { - printk(KERN_ERR "nbd: couldn't find device at index %d\n", - index); - mutex_unlock(&nbd_index_mutex); - return -EINVAL; - } - - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { - nbd->destroy_complete = &destroy_complete; - mutex_unlock(&nbd_index_mutex); - - /* Wait untill the the nbd stuff is totally destroyed */ - wait_for_completion(&destroy_complete); - goto again; - } - - if (!refcount_inc_not_zero(&nbd->refs)) { - mutex_unlock(&nbd_index_mutex); - if (index == -1) - goto again; - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); - return -EINVAL; - } mutex_unlock(&nbd_index_mutex); + if (!nbd) { + nbd = nbd_dev_add(index, 2); + if (IS_ERR(nbd)) { + pr_err("nbd: failed to add new device\n"); + return PTR_ERR(nbd); + } + } + mutex_lock(&nbd->config_lock); if (refcount_read(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); @@ -2424,16 +2433,21 @@ static int __init nbd_init(void) if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; + nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0); + if (!nbd_del_wq) { + unregister_blkdev(NBD_MAJOR, "nbd"); + return -ENOMEM; + } + if (genl_register_family(&nbd_genl_family)) { + destroy_workqueue(nbd_del_wq); unregister_blkdev(NBD_MAJOR, "nbd"); return -EINVAL; } nbd_dbg_init(); - mutex_lock(&nbd_index_mutex); for (i = 0; i < nbds_max; i++) - nbd_dev_add(i); - mutex_unlock(&nbd_index_mutex); + nbd_dev_add(i, 1); return 0; } @@ -2442,7 +2456,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data) struct list_head *list = (struct list_head *)data; struct nbd_device *nbd = ptr; - list_add_tail(&nbd->list, list); + /* Skip nbd that is being removed asynchronously 
*/ + if (refcount_read(&nbd->refs)) + list_add_tail(&nbd->list, list); + return 0; } @@ -2465,6 +2482,9 @@ static void __exit nbd_cleanup(void) nbd_put(nbd); } + /* Also wait for nbd_dev_remove_work() to complete */ + destroy_workqueue(nbd_del_wq); + idr_destroy(&nbd_index_idr); genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d734e9ee1546..187d779c8ca0 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,10 +11,6 @@ #include #include "null_blk.h" -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL @@ -1721,8 +1717,7 @@ static int null_gendisk_register(struct nullb *nullb) return ret; } - add_disk(disk); - return 0; + return add_disk(disk); } static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 538446b652de..0f26b2510a75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1183,10 +1183,8 @@ try_next_bio: wakeup = (pd->write_congestion_on > 0 && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); - if (wakeup) { - clear_bdi_congested(pd->disk->queue->backing_dev_info, - BLK_RW_ASYNC); - } + if (wakeup) + clear_bdi_congested(pd->disk->bdi, BLK_RW_ASYNC); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2366,7 +2364,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(q->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(bio->bi_bdev->bd_disk->bdi, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); congestion_wait(BLK_RW_ASYNC, HZ); diff --git a/drivers/block/ps3disk.c
b/drivers/block/ps3disk.c index f374ea2c67ce..8d51efbe045d 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -83,26 +83,12 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, unsigned int offset = 0; struct req_iterator iter; struct bio_vec bvec; - unsigned int i = 0; - size_t size; - void *buf; rq_for_each_segment(bvec, req, iter) { - unsigned long flags; - dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %llu\n", - __func__, __LINE__, i, bio_sectors(iter.bio), - iter.bio->bi_iter.bi_sector); - - size = bvec.bv_len; - buf = bvec_kmap_irq(&bvec, &flags); if (gather) - memcpy(dev->bounce_buf+offset, buf, size); + memcpy_from_bvec(dev->bounce_buf + offset, &bvec); else - memcpy(buf, dev->bounce_buf+offset, size); - offset += size; - flush_kernel_dcache_page(bvec.bv_page); - bvec_kunmap_irq(buf, &flags); - i++; + memcpy_to_bvec(&bvec, dev->bounce_buf + offset); } } diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 7fbf469651c4..c7b19e128b03 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -541,7 +541,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, bio_for_each_segment(bvec, bio, iter) { /* PS3 is ppc64, so we don't handle highmem */ - char *ptr = page_address(bvec.bv_page) + bvec.bv_offset; + char *ptr = bvec_virt(&bvec); size_t len = bvec.bv_len, retlen; dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op, diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 90b947c96402..e65c9d706f6f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1219,24 +1219,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.size = 0; } -static void zero_bvec(struct bio_vec *bv) -{ - void *buf; - unsigned long flags; - - buf = bvec_kmap_irq(bv, &flags); - memset(buf, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(buf, &flags); -} - static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) { 
struct ceph_bio_iter it = *bio_pos; ceph_bio_iter_advance(&it, off); ceph_bio_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } @@ -1246,7 +1235,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) ceph_bvec_iter_advance(&it, off); ceph_bvec_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } @@ -2997,8 +2986,7 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) }; ceph_bvec_iter_advance_step(&it, bytes, ({ - if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, - bv.bv_len)) + if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len)) return false; })); return true; diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 324afdd63a96..4b93fd83bf79 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -227,17 +227,17 @@ static ssize_t state_show(struct kobject *kobj, switch (dev->dev_state) { case DEV_STATE_INIT: - return snprintf(page, PAGE_SIZE, "init\n"); + return sysfs_emit(page, "init\n"); case DEV_STATE_MAPPED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "open\n"); + return sysfs_emit(page, "open\n"); case DEV_STATE_MAPPED_DISCONNECTED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "closed\n"); + return sysfs_emit(page, "closed\n"); case DEV_STATE_UNMAPPED: - return snprintf(page, PAGE_SIZE, "unmapped\n"); + return sysfs_emit(page, "unmapped\n"); default: - return snprintf(page, PAGE_SIZE, "unknown\n"); + return sysfs_emit(page, "unknown\n"); } } @@ -263,7 +263,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname); + return sysfs_emit(page, "%s\n", dev->pathname); } static struct kobj_attribute rnbd_clt_mapping_path_attr = @@ -276,8 +276,7 @@ static ssize_t access_mode_show(struct kobject 
*kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return snprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(dev->access_mode)); + return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode)); } static struct kobj_attribute rnbd_clt_access_mode = @@ -286,8 +285,8 @@ static struct kobj_attribute rnbd_clt_access_mode = static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj, @@ -357,9 +356,8 @@ static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, @@ -390,8 +388,7 @@ static struct kobj_attribute rnbd_clt_resize_dev_attr = static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo <1> > %s\n", attr->attr.name); } static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj, @@ -436,7 +433,7 @@ static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname); + return sysfs_emit(page, "%s\n", dev->sess->sessname); } static struct kobj_attribute rnbd_clt_session_attr = @@ -499,8 +496,8 @@ static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] 
device_path= [access_mode=] [nr_poll_queues=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", + return sysfs_emit(page, + "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=] [nr_poll_queues=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", attr->attr.name); } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index e9cc413495f0..bd4a41afbbfc 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -271,7 +271,7 @@ unlock: */ if (cpu_q) *cpup = cpu_q->cpu; - put_cpu_var(sess->cpu_rr); + put_cpu_ptr(sess->cpu_rr); if (q) rnbd_clt_dev_requeue(q); diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index acf5fced11ef..4db98e0e76f0 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -90,8 +90,8 @@ static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%d\n", - !(sess_dev->open_flags & FMODE_WRITE)); + return sysfs_emit(page, "%d\n", + !(sess_dev->open_flags & FMODE_WRITE)); } static struct kobj_attribute rnbd_srv_dev_session_ro_attr = @@ -105,8 +105,8 @@ static ssize_t access_mode_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(sess_dev->access_mode)); + return sysfs_emit(page, "%s\n", + rnbd_access_mode_str(sess_dev->access_mode)); } static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = @@ -119,7 +119,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname); + return sysfs_emit(page, "%s\n", sess_dev->pathname); } static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = @@ -128,8 
+128,8 @@ static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo 1 > %s\n", + attr->attr.name); } static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 7b54353ee92b..420cd952ddc4 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1373,7 +1373,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (!disk) return; - if (disk->flags & GENHD_FL_UP) + if (host->state > HST_DEV_ACTIVATE) del_gendisk(disk); blk_cleanup_disk(disk); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1bb8b6827dc7..9a8b43402151 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -166,11 +166,8 @@ static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); - } - + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) + kfree(bvec_virt(&req->special_vec)); blk_mq_end_request(req, virtblk_result(vbr)); } @@ -846,7 +843,7 @@ static int virtblk_probe(struct virtio_device *vdev) "block size is changed unexpectedly, now is %u\n", blk_size); err = -EINVAL; - goto err_cleanup_disk; + goto out_cleanup_disk; } /* Use topology information if available */ @@ -904,10 +901,13 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + if (err) + goto out_cleanup_disk; + return 0; -err_cleanup_disk: +out_cleanup_disk: 
blk_cleanup_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d83fee21f6c5..715bfa8aca7f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1092,7 +1092,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = xlbd_reserve_minors(minor, nr_minors); if (err) return err; - err = -ENODEV; memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index fabad79baafc..5e3e96d3d1b9 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -51,6 +51,15 @@ #define TICK_BASE_CNT 1 +#ifdef CONFIG_ARM +/* Use values higher than ARM arch timer. See 6282edb72bed. */ +#define MCT_CLKSOURCE_RATING 450 +#define MCT_CLKEVENTS_RATING 500 +#else +#define MCT_CLKSOURCE_RATING 350 +#define MCT_CLKEVENTS_RATING 350 +#endif + enum { MCT_INT_SPI, MCT_INT_PPI @@ -206,7 +215,7 @@ static void exynos4_frc_resume(struct clocksource *cs) static struct clocksource mct_frc = { .name = "mct-frc", - .rating = 450, /* use value higher than ARM arch timer */ + .rating = MCT_CLKSOURCE_RATING, .read = exynos4_frc_read, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -456,8 +465,9 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) evt->set_state_oneshot = set_state_shutdown; evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; - evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; - evt->rating = 500; /* use value higher than ARM arch timer */ + evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERCPU; + evt->rating = MCT_CLKEVENTS_RATING; exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET); diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c index
a129840f14f9..cb6fc2f152d4 100644 --- a/drivers/clocksource/ingenic-sysost.c +++ b/drivers/clocksource/ingenic-sysost.c @@ -4,6 +4,7 @@ * Copyright (c) 2020 周琰杰 (Zhou Yanjie) */ +#include #include #include #include @@ -34,8 +35,6 @@ /* bits within the OSTCCR register */ #define OSTCCR_PRESCALE1_MASK 0x3 #define OSTCCR_PRESCALE2_MASK 0xc -#define OSTCCR_PRESCALE1_LSB 0 -#define OSTCCR_PRESCALE2_LSB 2 /* bits within the OSTCR register */ #define OSTCR_OST1CLR BIT(0) @@ -98,7 +97,7 @@ static unsigned long ingenic_ost_percpu_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE1_MASK) >> OSTCCR_PRESCALE1_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE1_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -112,7 +111,7 @@ static unsigned long ingenic_ost_global_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE2_MASK) >> OSTCCR_PRESCALE2_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE2_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -151,7 +150,8 @@ static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE1_MASK) | (prescale << OSTCCR_PRESCALE1_LSB); + val &= ~OSTCCR_PRESCALE1_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE1_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; @@ -166,7 +166,8 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE2_MASK) | (prescale << OSTCCR_PRESCALE2_LSB); + val &= ~OSTCCR_PRESCALE2_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE2_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index d7ed99f0001f..dd0956ad969c 
100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -579,7 +579,8 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) ch->flags |= flag; /* setup timeout if no clockevent */ - if ((flag == FLAG_CLOCKSOURCE) && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && + flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) __sh_cmt_set_next(ch, ch->max_match_value); out: raw_spin_unlock_irqrestore(&ch->lock, flags); @@ -621,20 +622,25 @@ static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) static u64 sh_cmt_clocksource_read(struct clocksource *cs) { struct sh_cmt_channel *ch = cs_to_sh_cmt(cs); - unsigned long flags; u32 has_wrapped; - u64 value; - u32 raw; - raw_spin_lock_irqsave(&ch->lock, flags); - value = ch->total_cycles; - raw = sh_cmt_get_counter(ch, &has_wrapped); + if (ch->cmt->num_channels == 1) { + unsigned long flags; + u64 value; + u32 raw; - if (unlikely(has_wrapped)) - raw += ch->match_value + 1; - raw_spin_unlock_irqrestore(&ch->lock, flags); + raw_spin_lock_irqsave(&ch->lock, flags); + value = ch->total_cycles; + raw = sh_cmt_get_counter(ch, &has_wrapped); - return value + raw; + if (unlikely(has_wrapped)) + raw += ch->match_value + 1; + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return value + raw; + } + + return sh_cmt_get_counter(ch, &has_wrapped); } static int sh_cmt_clocksource_enable(struct clocksource *cs) @@ -697,7 +703,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, cs->disable = sh_cmt_clocksource_disable; cs->suspend = sh_cmt_clocksource_suspend; cs->resume = sh_cmt_clocksource_resume; - cs->mask = CLOCKSOURCE_MASK(sizeof(u64) * 8); + cs->mask = CLOCKSOURCE_MASK(ch->cmt->info->width); cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n", diff --git a/drivers/clocksource/timer-fttmr010.c b/drivers/clocksource/timer-fttmr010.c index edb1d5f193f5..126fb1f259b2 100644 --- 
a/drivers/clocksource/timer-fttmr010.c +++ b/drivers/clocksource/timer-fttmr010.c @@ -271,9 +271,7 @@ static irqreturn_t ast2600_timer_interrupt(int irq, void *dev_id) } static int __init fttmr010_common_init(struct device_node *np, - bool is_aspeed, - int (*timer_shutdown)(struct clock_event_device *), - irq_handler_t irq_handler) + bool is_aspeed, bool is_ast2600) { struct fttmr010 *fttmr010; int irq; @@ -374,8 +372,6 @@ static int __init fttmr010_common_init(struct device_node *np, fttmr010->tick_rate); } - fttmr010->timer_shutdown = timer_shutdown; - /* * Setup clockevent timer (interrupt-driven) on timer 1. */ @@ -383,8 +379,18 @@ static int __init fttmr010_common_init(struct device_node *np, writel(0, fttmr010->base + TIMER1_LOAD); writel(0, fttmr010->base + TIMER1_MATCH1); writel(0, fttmr010->base + TIMER1_MATCH2); - ret = request_irq(irq, irq_handler, IRQF_TIMER, - "FTTMR010-TIMER1", &fttmr010->clkevt); + + if (is_ast2600) { + fttmr010->timer_shutdown = ast2600_timer_shutdown; + ret = request_irq(irq, ast2600_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } else { + fttmr010->timer_shutdown = fttmr010_timer_shutdown; + ret = request_irq(irq, fttmr010_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } if (ret) { pr_err("FTTMR010-TIMER1 no IRQ\n"); goto out_unmap; @@ -432,23 +438,17 @@ out_disable_clock: static __init int ast2600_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - ast2600_timer_shutdown, - ast2600_timer_interrupt); + return fttmr010_common_init(np, true, true); } static __init int aspeed_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, true, false); } static __init int fttmr010_timer_init(struct device_node *np) { - return fttmr010_common_init(np, false, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, false, 
false); } TIMER_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_init); diff --git a/drivers/clocksource/timer-mediatek.c b/drivers/clocksource/timer-mediatek.c index ab63b95e414f..7bcb4a3f26fb 100644 --- a/drivers/clocksource/timer-mediatek.c +++ b/drivers/clocksource/timer-mediatek.c @@ -60,9 +60,9 @@ * SYST_CON_EN: Clock enable. Shall be set to * - Start timer countdown. * - Allow timeout ticks being updated. - * - Allow changing interrupt functions. + * - Allow changing interrupt status, e.g. clearing a pending irq. * - * SYST_CON_IRQ_EN: Set to allow interrupt. + * SYST_CON_IRQ_EN: Set to enable interrupt. * * SYST_CON_IRQ_CLR: Set to clear interrupt. */ @@ -75,6 +75,7 @@ static void __iomem *gpt_sched_reg __read_mostly; static void mtk_syst_ack_irq(struct timer_of *to) { /* Clear and disable interrupt */ + writel(SYST_CON_EN, SYST_CON_REG(to)); writel(SYST_CON_IRQ_CLR | SYST_CON_EN, SYST_CON_REG(to)); } @@ -111,6 +112,9 @@ static int mtk_syst_clkevt_next_event(unsigned long ticks, static int mtk_syst_clkevt_shutdown(struct clock_event_device *clkevt) { + /* Clear any irq */ + mtk_syst_ack_irq(to_timer_of(clkevt)); + /* Disable timer */ writel(0, SYST_CON_REG(to_timer_of(clkevt))); diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig deleted file mode 100644 index 04caa0f2d445..000000000000 --- a/drivers/lightnvm/Kconfig +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Open-Channel SSD NVM configuration -# - -menuconfig NVM - bool "Open-Channel SSD target support (DEPRECATED)" - depends on BLOCK - help - Say Y here to get to enable Open-channel SSDs. - - Open-Channel SSDs implement a set of extension to SSDs, that - exposes direct access to the underlying non-volatile memory. - - If you say N, all options in this submenu will be skipped and disabled - only do this if you know what you are doing. - - This code is deprecated and will be removed in Linux 5.15.
- -if NVM - -config NVM_PBLK - tristate "Physical Block Device Open-Channel SSD target" - select CRC32 - help - Allows an open-channel SSD to be exposed as a block device to the - host. The target assumes the device exposes raw flash and must be - explicitly managed by the host. - - Please note the disk format is considered EXPERIMENTAL for now. - -if NVM_PBLK - -config NVM_PBLK_DEBUG - bool "PBlk Debug Support" - default n - help - Enables debug support for pblk. This includes extra checks, more - vocal error messages, and extra tracking fields in the pblk sysfs - entries. - -endif # NVM_PBLK_DEBUG - -endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile deleted file mode 100644 index 97d9d7c71550..000000000000 --- a/drivers/lightnvm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Open-Channel SSDs. -# - -obj-$(CONFIG_NVM) := core.o -obj-$(CONFIG_NVM_PBLK) += pblk.o -pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ - pblk-write.o pblk-cache.o pblk-read.o \ - pblk-gc.o pblk-recovery.o pblk-map.o \ - pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c deleted file mode 100644 index cf8a75494833..000000000000 --- a/drivers/lightnvm/core.c +++ /dev/null @@ -1,1440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 IT University of Copenhagen. All rights reserved. 
- * Initial release: Matias Bjorling - */ - -#define pr_fmt(fmt) "nvm: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static LIST_HEAD(nvm_tgt_types); -static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_devices); -static DECLARE_RWSEM(nvm_lock); - -/* Map between virtual and physical channel and lun */ -struct nvm_ch_map { - int ch_off; - int num_lun; - int *lun_offs; -}; - -struct nvm_dev_map { - struct nvm_ch_map *chnls; - int num_ch; -}; - -static void nvm_free(struct kref *ref); - -static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &dev->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static bool nvm_target_exists(const char *name) -{ - struct nvm_dev *dev; - struct nvm_target *tgt; - bool ret = false; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - list_for_each_entry(tgt, &dev->targets, list) { - if (!strcmp(name, tgt->disk->disk_name)) { - ret = true; - mutex_unlock(&dev->mlock); - goto out; - } - } - mutex_unlock(&dev->mlock); - } - -out: - up_write(&nvm_lock); - return ret; -} - -static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("lun %d already allocated\n", i); - goto err; - } - } - - return 0; -err: - while (--i >= lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < 
dev_map->num_ch; i++) { - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - if (clear) { - for (j = 0; j < ch_map->num_lun; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.num_lun) + lun; - - WARN_ON(!test_and_clear_bit(lunid, - dev->lun_map)); - } - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - u16 lun_begin, u16 lun_end, - u16 op) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_dev_map *dev_map; - struct ppa_addr *luns; - int num_lun = lun_end - lun_begin + 1; - int luns_left = num_lun; - int num_ch = num_lun / dev->geo.num_lun; - int num_ch_mod = num_lun % dev->geo.num_lun; - int bch = lun_begin / dev->geo.num_lun; - int blun = lun_begin % dev->geo.num_lun; - int lunid = 0; - int lun_balanced = 1; - int sec_per_lun, prev_num_lun; - int i, j; - - num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1; - - dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_num_lun = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - for (i = 0; i < num_ch; i++) { - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.num_lun) ? 
- dev->geo.num_lun : luns_left; - - if (lun_balanced && prev_num_lun != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->num_lun = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].a.ch = i; - luns[lunid++].a.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->num_ch = num_ch; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - /* Inherit device geometry from parent */ - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.num_ch = num_ch; - tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1; - tgt_dev->geo.all_luns = num_lun; - tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk; - - tgt_dev->geo.op = op; - - sec_per_lun = dev->geo.clba * dev->geo.num_chk; - tgt_dev->geo.total_secs = num_lun * sec_per_lun; - - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i >= 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static struct nvm_tgt_type *__nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; - - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - down_write(&nvm_tgtt_lock); - tt = __nvm_find_target_type(name); - up_write(&nvm_tgtt_lock); - - return tt; -} - -static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, 
- int lun_end) -{ - if (lun_begin > lun_end || lun_end >= geo->all_luns) { - pr_err("lun out of bound (%u:%u > %u)\n", - lun_begin, lun_end, geo->all_luns - 1); - return -EINVAL; - } - - return 0; -} - -static int __nvm_config_simple(struct nvm_dev *dev, - struct nvm_ioctl_create_simple *s) -{ - struct nvm_geo *geo = &dev->geo; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = geo->all_luns - 1; - } - - return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); -} - -static int __nvm_config_extended(struct nvm_dev *dev, - struct nvm_ioctl_create_extended *e) -{ - if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { - e->lun_begin = 0; - e->lun_end = dev->geo.all_luns - 1; - } - - /* op not set falls into target's default */ - if (e->op == 0xFFFF) { - e->op = NVM_TARGET_DEFAULT_OP; - } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { - pr_err("invalid over provisioning value\n"); - return -EINVAL; - } - - return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end); -} - -static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_extended e; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev *tgt_dev; - void *targetdata; - unsigned int mdts; - int ret; - - switch (create->conf.type) { - case NVM_CONFIG_TYPE_SIMPLE: - ret = __nvm_config_simple(dev, &create->conf.s); - if (ret) - return ret; - - e.lun_begin = create->conf.s.lun_begin; - e.lun_end = create->conf.s.lun_end; - e.op = NVM_TARGET_DEFAULT_OP; - break; - case NVM_CONFIG_TYPE_EXTENDED: - ret = __nvm_config_extended(dev, &create->conf.e); - if (ret) - return ret; - - e = create->conf.e; - break; - default: - pr_err("config type not valid\n"); - return -EINVAL; - } - - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("target type %s not found\n", create->tgttype); - return -EINVAL; - } - - if ((tt->flags & NVM_TGT_F_HOST_L2P) != 
(dev->geo.dom & NVM_RSP_L2P)) { - pr_err("device is incompatible with target L2P type.\n"); - return -EINVAL; - } - - if (nvm_target_exists(create->tgtname)) { - pr_err("target name already exists (%s)\n", - create->tgtname); - return -EINVAL; - } - - ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); - if (ret) - return ret; - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) { - ret = -ENOMEM; - goto err_reserve; - } - - tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); - if (!tgt_dev) { - pr_err("could not create target device\n"); - ret = -ENOMEM; - goto err_t; - } - - tdisk = blk_alloc_disk(dev->q->node); - if (!tdisk) { - ret = -ENOMEM; - goto err_dev; - } - - strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = tt->bops; - - targetdata = tt->init(tgt_dev, tdisk, create->flags); - if (IS_ERR(targetdata)) { - ret = PTR_ERR(targetdata); - goto err_init; - } - - tdisk->private_data = targetdata; - tdisk->queue->queuedata = targetdata; - - mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA; - if (dev->geo.mdts) { - mdts = min_t(u32, dev->geo.mdts, - (dev->geo.csecs >> 9) * NVM_MAX_VLBA); - } - blk_queue_max_hw_sectors(tdisk->queue, mdts); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - if (tt->sysfs_init && tt->sysfs_init(tdisk)) { - ret = -ENOMEM; - goto err_sysfs; - } - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&dev->mlock); - list_add_tail(&t->list, &dev->targets); - mutex_unlock(&dev->mlock); - - __module_get(tt->owner); - - return 0; -err_sysfs: - if (tt->exit) - tt->exit(targetdata, true); -err_init: - blk_cleanup_disk(tdisk); -err_dev: - nvm_remove_tgt_dev(tgt_dev, 0); -err_t: - kfree(t); -err_reserve: - nvm_release_luns_err(dev, e.lun_begin, e.lun_end); - return ret; -} - -static void __nvm_remove_target(struct nvm_target *t, bool graceful) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk 
*tdisk = t->disk; - - del_gendisk(tdisk); - - if (tt->sysfs_exit) - tt->sysfs_exit(tdisk); - - if (tt->exit) - tt->exit(tdisk->private_data, graceful); - - nvm_remove_tgt_dev(t->dev, 1); - blk_cleanup_disk(tdisk); - module_put(t->type->owner); - - list_del(&t->list); - kfree(t); -} - -/** - * nvm_remove_tgt - Removes a target from the media manager - * @remove: ioctl structure with target name to remove. - * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t = NULL; - struct nvm_dev *dev; - - down_read(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - t = nvm_find_target(dev, remove->tgtname); - if (t) { - mutex_unlock(&dev->mlock); - break; - } - mutex_unlock(&dev->mlock); - } - up_read(&nvm_lock); - - if (!t) { - pr_err("failed to remove target %s\n", - remove->tgtname); - return 1; - } - - __nvm_remove_target(t, true); - kref_put(&dev->ref, nvm_free); - - return 0; -} - -static int nvm_register_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap; - int i, j; - - rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!rmap) - goto err_rmap; - - rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map), - GFP_KERNEL); - if (!rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.num_ch; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.num_lun; - - ch_rmap = &rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->num_lun = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - dev->rmap = rmap; - - return 0; -err_ch: - while (--i >= 0) - kfree(rmap->chnls[i].lun_offs); -err_chnls: - kfree(rmap); -err_rmap: - return -ENOMEM; -} - -static void nvm_unregister_map(struct nvm_dev *dev) -{ - struct nvm_dev_map 
*rmap = dev->rmap; - int i; - - for (i = 0; i < dev->geo.num_ch; i++) - kfree(rmap->chnls[i].lun_offs); - - kfree(rmap->chnls); - kfree(rmap); -} - -static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev_map *dev_map = tgt_dev->map; - struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch]; - int lun_off = ch_map->lun_offs[p->a.lun]; - - p->a.ch += ch_map->ch_off; - p->a.lun += lun_off; -} - -static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch]; - int lun_roff = ch_rmap->lun_offs[p->a.lun]; - - p->a.ch -= ch_rmap->ch_off; - p->a.lun -= lun_roff; -} - -static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - nvm_map_to_dev(tgt_dev, &ppa_list[i]); - ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]); - } -} - -static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]); - nvm_map_to_tgt(tgt_dev, &ppa_list[i]); - } -} - -static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_tgt_to_dev(tgt_dev, ppa_list, rqd->nr_ppas); -} - -static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_dev_to_tgt(tgt_dev, ppa_list, rqd->nr_ppas); -} - -int nvm_register_tgt_type(struct nvm_tgt_type *tt) -{ - int ret = 0; - - down_write(&nvm_tgtt_lock); - if (__nvm_find_target_type(tt->name)) - ret = -EEXIST; - else - list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_tgtt_lock); - - return ret; -} -EXPORT_SYMBOL(nvm_register_tgt_type); - -void 
nvm_unregister_tgt_type(struct nvm_tgt_type *tt) -{ - if (!tt) - return; - - down_write(&nvm_tgtt_lock); - list_del(&tt->list); - up_write(&nvm_tgtt_lock); -} -EXPORT_SYMBOL(nvm_unregister_tgt_type); - -void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, - dma_addr_t *dma_handler) -{ - return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, - dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_alloc); - -void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) -{ - dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_free); - -static struct nvm_dev *nvm_find_nvm_dev(const char *name) -{ - struct nvm_dev *dev; - - list_for_each_entry(dev, &nvm_devices, devices) - if (!strcmp(name, dev->name)) - return dev; - - return NULL; -} - -static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->pln_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} - -static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} - -static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) -{ - int flags = 0; - - if (geo->version == NVM_OCSSD_SPEC_20) - return 0; - - if 
(rqd->is_seq) - flags |= geo->pln_mode >> 1; - - if (rqd->opcode == NVM_OP_PREAD) - flags |= (NVM_IO_SCRAMBLE_ENABLE | NVM_IO_SUSPEND); - else if (rqd->opcode == NVM_OP_PWRITE) - flags |= NVM_IO_SCRAMBLE_ENABLE; - - return flags; -} - -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - return ret; -} -EXPORT_SYMBOL(nvm_submit_io); - -static void nvm_sync_end_io(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->private; - - complete(waiting); -} - -static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rqd->end_io = nvm_sync_end_io; - rqd->private = &wait; - - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - return ret; - - wait_for_completion_io(&wait); - - return 0; -} - -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - ret = nvm_submit_io_wait(dev, rqd, buf); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_io_sync); - -void nvm_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - - /* Convert address space */ - if (tgt_dev) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - - if (rqd->end_io) - rqd->end_io(rqd); -} -EXPORT_SYMBOL(nvm_end_io); - -static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - if (!dev->ops->submit_io) - return -ENODEV; - - rqd->dev = NULL; - rqd->flags = 
nvm_set_flags(&dev->geo, rqd); - - return nvm_submit_io_wait(dev, rqd, NULL); -} - -static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) -{ - struct nvm_rq rqd = { NULL }; - struct bio bio; - struct bio_vec bio_vec; - struct page *page; - int ret; - - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - bio_init(&bio, &bio_vec, 1); - bio_add_page(&bio, page, PAGE_SIZE, 0); - bio_set_op_attrs(&bio, REQ_OP_READ, 0); - - rqd.bio = &bio; - rqd.opcode = NVM_OP_PREAD; - rqd.is_seq = 1; - rqd.nr_ppas = 1; - rqd.ppa_addr = generic_to_dev_addr(dev, ppa); - - ret = nvm_submit_io_sync_raw(dev, &rqd); - __free_page(page); - if (ret) - return ret; - - return rqd.error; -} - -/* - * Scans a 1.2 chunk first and last page to determine if its state. - * If the chunk is found to be open, also scan it to update the write - * pointer. - */ -static int nvm_bb_chunk_scan(struct nvm_dev *dev, struct ppa_addr ppa, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, pg, pl; - - /* sense first page */ - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) /* valid data */ - meta->state = NVM_CHK_ST_OPEN; - else if (ret > 0) { - /* - * If empty page, the chunk is free, else it is an - * actual io error. In that case, mark it offline. 
- */ - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - meta->state = NVM_CHK_ST_FREE; - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - goto scan; - default: - return -ret; /* other io error */ - } - } - - /* sense last page */ - ppa.g.pg = geo->num_pg - 1; - ppa.g.pl = geo->num_pln - 1; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { /* Chunk fully written */ - meta->state = NVM_CHK_ST_CLOSED; - meta->wp = geo->clba; - return 0; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - break; - default: - return -ret; /* other io error */ - } - } - -scan: - /* - * chunk is open, we scan sequentially to update the write pointer. - * We make the assumption that targets write data across all planes - * before moving to the next page. - */ - for (pg = 0; pg < geo->num_pg; pg++) { - for (pl = 0; pl < geo->num_pln; pl++) { - ppa.g.pg = pg; - ppa.g.pl = pl; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { - meta->wp += geo->ws_min; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->wp += geo->ws_min; - break; - default: - return -ret; /* other io error */ - } - } - } - } - - return 0; -} - -/* - * folds a bad block list from its plane representation to its - * chunk representation. - * - * If any of the planes status are bad or grown bad, the chunk is marked - * offline. If not bad, the first plane state acts as the chunk state. 
- */ -static int nvm_bb_to_chunk(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, blk, pl, offset, blktype; - - for (blk = 0; blk < geo->num_chk; blk++) { - offset = blk * geo->pln_mode; - blktype = blks[offset]; - - for (pl = 0; pl < geo->pln_mode; pl++) { - if (blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = blks[offset + pl]; - break; - } - } - - ppa.g.blk = blk; - - meta->wp = 0; - meta->type = NVM_CHK_TP_W_SEQ; - meta->wi = 0; - meta->slba = generic_to_dev_addr(dev, ppa).ppa; - meta->cnlb = dev->geo.clba; - - if (blktype == NVM_BLK_T_FREE) { - ret = nvm_bb_chunk_scan(dev, ppa, meta); - if (ret) - return ret; - } else { - meta->state = NVM_CHK_ST_OFFLINE; - } - - meta++; - } - - return 0; -} - -static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - u8 *blks; - int ch, lun, nr_blks; - int ret = 0; - - ppa.ppa = slba; - ppa = dev_to_generic_addr(dev, ppa); - - if (ppa.g.blk != 0) - return -EINVAL; - - if ((nchks % geo->num_chk) != 0) - return -EINVAL; - - nr_blks = geo->num_chk * geo->pln_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (ch = ppa.g.ch; ch < geo->num_ch; ch++) { - for (lun = ppa.g.lun; lun < geo->num_lun; lun++) { - struct ppa_addr ppa_gen, ppa_dev; - - if (!nchks) - goto done; - - ppa_gen.ppa = 0; - ppa_gen.g.ch = ch; - ppa_gen.g.lun = lun; - ppa_dev = generic_to_dev_addr(dev, ppa_gen); - - ret = dev->ops->get_bb_tbl(dev, ppa_dev, blks); - if (ret) - goto done; - - ret = nvm_bb_to_chunk(dev, ppa_gen, blks, nr_blks, - meta); - if (ret) - goto done; - - meta += geo->num_chk; - nchks -= geo->num_chk; - } - } -done: - kfree(blks); - return ret; -} - -int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_dev *dev = 
tgt_dev->parent; - - nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); - - if (dev->geo.version == NVM_OCSSD_SPEC_12) - return nvm_get_bb_meta(dev, (sector_t)ppa.ppa, nchks, meta); - - return dev->ops->get_chk_meta(dev, (sector_t)ppa.ppa, nchks, meta); -} -EXPORT_SYMBOL_GPL(nvm_get_chunk_meta); - -int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_rq rqd; - int ret; - - if (dev->geo.version == NVM_OCSSD_SPEC_20) - return 0; - - if (nr_ppas > NVM_MAX_VLBA) { - pr_err("unable to update all blocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - nvm_rq_tgt_to_dev(tgt_dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(tgt_dev, &rqd); - if (ret) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL_GPL(nvm_set_chunk_meta); - -static int nvm_core_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret; - - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), - sizeof(unsigned long), GFP_KERNEL); - if (!dev->lun_map) - return -ENOMEM; - - INIT_LIST_HEAD(&dev->area_list); - INIT_LIST_HEAD(&dev->targets); - mutex_init(&dev->mlock); - spin_lock_init(&dev->lock); - - ret = nvm_register_map(dev); - if (ret) - goto err_fmtype; - - return 0; -err_fmtype: - kfree(dev->lun_map); - return ret; -} - -static void nvm_free(struct kref *ref) -{ - struct nvm_dev *dev = container_of(ref, struct nvm_dev, ref); - - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - - if (dev->rmap) - nvm_unregister_map(dev); - - kfree(dev->lun_map); - kfree(dev); -} - -static int nvm_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret = -EINVAL; - - if (dev->ops->identity(dev)) { - pr_err("device could not be identified\n"); - goto err; - } - - pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id, - geo->minor_ver_id, 
geo->vmnt); - - ret = nvm_core_init(dev); - if (ret) { - pr_err("could not initialize core structures.\n"); - goto err; - } - - pr_info("registered %s [%u/%u/%u/%u/%u]\n", - dev->name, dev->geo.ws_min, dev->geo.ws_opt, - dev->geo.num_chk, dev->geo.all_luns, - dev->geo.num_ch); - return 0; -err: - pr_err("failed to initialize nvm\n"); - return ret; -} - -struct nvm_dev *nvm_alloc_dev(int node) -{ - struct nvm_dev *dev; - - dev = kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); - if (dev) - kref_init(&dev->ref); - - return dev; -} -EXPORT_SYMBOL(nvm_alloc_dev); - -int nvm_register(struct nvm_dev *dev) -{ - int ret, exp_pool_size; - - pr_warn_once("lightnvm support is deprecated and will be removed in Linux 5.15.\n"); - - if (!dev->q || !dev->ops) { - kref_put(&dev->ref, nvm_free); - return -EINVAL; - } - - ret = nvm_init(dev); - if (ret) { - kref_put(&dev->ref, nvm_free); - return ret; - } - - exp_pool_size = max_t(int, PAGE_SIZE, - (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); - exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); - - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", - exp_pool_size); - if (!dev->dma_pool) { - pr_err("could not create dma pool\n"); - kref_put(&dev->ref, nvm_free); - return -ENOMEM; - } - - /* register device with a supported media manager */ - down_write(&nvm_lock); - list_add(&dev->devices, &nvm_devices); - up_write(&nvm_lock); - - return 0; -} -EXPORT_SYMBOL(nvm_register); - -void nvm_unregister(struct nvm_dev *dev) -{ - struct nvm_target *t, *tmp; - - mutex_lock(&dev->mlock); - list_for_each_entry_safe(t, tmp, &dev->targets, list) { - if (t->dev->parent != dev) - continue; - __nvm_remove_target(t, false); - kref_put(&dev->ref, nvm_free); - } - mutex_unlock(&dev->mlock); - - down_write(&nvm_lock); - list_del(&dev->devices); - up_write(&nvm_lock); - - kref_put(&dev->ref, nvm_free); -} -EXPORT_SYMBOL(nvm_unregister); - -static int __nvm_configure_create(struct nvm_ioctl_create *create) -{ - struct nvm_dev *dev; - int 
ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(create->dev); - up_write(&nvm_lock); - - if (!dev) { - pr_err("device not found\n"); - return -EINVAL; - } - - kref_get(&dev->ref); - ret = nvm_create_tgt(dev, create); - if (ret) - kref_put(&dev->ref, nvm_free); - - return ret; -} - -static long nvm_ioctl_info(struct file *file, void __user *arg) -{ - struct nvm_ioctl_info *info; - struct nvm_tgt_type *tt; - int tgt_iter = 0; - - info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); - if (IS_ERR(info)) - return PTR_ERR(info); - - info->version[0] = NVM_VERSION_MAJOR; - info->version[1] = NVM_VERSION_MINOR; - info->version[2] = NVM_VERSION_PATCH; - - down_write(&nvm_tgtt_lock); - list_for_each_entry(tt, &nvm_tgt_types, list) { - struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; - - tgt->version[0] = tt->version[0]; - tgt->version[1] = tt->version[1]; - tgt->version[2] = tt->version[2]; - strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX); - - tgt_iter++; - } - - info->tgtsize = tgt_iter; - up_write(&nvm_tgtt_lock); - - if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { - kfree(info); - return -EFAULT; - } - - kfree(info); - return 0; -} - -static long nvm_ioctl_get_devices(struct file *file, void __user *arg) -{ - struct nvm_ioctl_get_devices *devices; - struct nvm_dev *dev; - int i = 0; - - devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - struct nvm_ioctl_device_info *info = &devices->info[i]; - - strlcpy(info->devname, dev->name, sizeof(info->devname)); - - /* kept for compatibility */ - info->bmversion[0] = 1; - info->bmversion[1] = 0; - info->bmversion[2] = 0; - strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); - i++; - - if (i >= ARRAY_SIZE(devices->info)) { - pr_err("max %zd devices can be reported.\n", - ARRAY_SIZE(devices->info)); - break; - } - } - up_write(&nvm_lock); - - devices->nr_devices 
= i; - - if (copy_to_user(arg, devices, - sizeof(struct nvm_ioctl_get_devices))) { - kfree(devices); - return -EFAULT; - } - - kfree(devices); - return 0; -} - -static long nvm_ioctl_dev_create(struct file *file, void __user *arg) -{ - struct nvm_ioctl_create create; - - if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) - return -EFAULT; - - if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && - create.conf.e.rsv != 0) { - pr_err("reserved config field in use\n"); - return -EINVAL; - } - - create.dev[DISK_NAME_LEN - 1] = '\0'; - create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; - create.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (create.flags != 0) { - __u32 flags = create.flags; - - /* Check for valid flags */ - if (flags & NVM_TARGET_FACTORY) - flags &= ~NVM_TARGET_FACTORY; - - if (flags) { - pr_err("flag not supported\n"); - return -EINVAL; - } - } - - return __nvm_configure_create(&create); -} - -static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) -{ - struct nvm_ioctl_remove remove; - - if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove))) - return -EFAULT; - - remove.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (remove.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return nvm_remove_tgt(&remove); -} - -/* kept for compatibility reasons */ -static long nvm_ioctl_dev_init(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_init init; - - if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) - return -EFAULT; - - if (init.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return 0; -} - -/* Kept for compatibility reasons */ -static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_factory fact; - - if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) - return -EFAULT; - - fact.dev[DISK_NAME_LEN - 1] = '\0'; - - if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) - return -EINVAL; - - return 0; -} - 
-static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case NVM_INFO: - return nvm_ioctl_info(file, argp); - case NVM_GET_DEVICES: - return nvm_ioctl_get_devices(file, argp); - case NVM_DEV_CREATE: - return nvm_ioctl_dev_create(file, argp); - case NVM_DEV_REMOVE: - return nvm_ioctl_dev_remove(file, argp); - case NVM_DEV_INIT: - return nvm_ioctl_dev_init(file, argp); - case NVM_DEV_FACTORY: - return nvm_ioctl_dev_factory(file, argp); - } - return 0; -} - -static const struct file_operations _ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = nvm_ctl_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _nvm_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lightnvm", - .nodename = "lightnvm/control", - .fops = &_ctl_fops, -}; -builtin_misc_device(_nvm_misc); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c deleted file mode 100644 index f185f1a00008..000000000000 --- a/drivers/lightnvm/pblk-cache.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-cache.c - pblk's write cache - */ - -#include "pblk.h" - -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags) -{ - struct pblk_w_ctx w_ctx; - sector_t lba = pblk_get_lba(bio); - unsigned long start_time; - unsigned int bpos, pos; - int nr_entries = pblk_get_secs(bio); - int i, ret; - - start_time = bio_start_io_acct(bio); - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - switch (ret) { - case NVM_IO_REQUEUE: - io_schedule(); - goto retry; - case NVM_IO_ERR: - pblk_pipeline_stop(pblk); - bio_io_error(bio); - goto out; - } - - pblk_ppa_set_empty(&w_ctx.ppa); - w_ctx.flags = flags; - if (bio->bi_opf & REQ_PREFLUSH) { - w_ctx.flags |= PBLK_FLUSH_ENTRY; - pblk_write_kick(pblk); - } - - if (unlikely(!bio_has_data(bio))) - goto out; - - for (i = 0; i < nr_entries; i++) { - void *data = bio_data(bio); - - w_ctx.lba = lba + i; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); - pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); - - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - - atomic64_add(nr_entries, &pblk->user_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_entries, &pblk->inflight_writes); - atomic_long_add(nr_entries, &pblk->req_writes); -#endif - - pblk_rl_inserted(&pblk->rl, nr_entries); - -out: - bio_end_io_acct(bio, start_time); - pblk_write_should_kick(pblk); - - if (ret == NVM_IO_DONE) - bio_endio(bio); -} - -/* - * On GC the incoming lbas are not necessarily sequential. 
Also, some of the - * lbas might not be valid entries, which are marked as empty by the GC thread - */ -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct pblk_w_ctx w_ctx; - unsigned int bpos, pos; - void *data = gc_rq->data; - int i, valid_entries; - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { - io_schedule(); - goto retry; - } - - w_ctx.flags = PBLK_IOTYPE_GC; - pblk_ppa_set_empty(&w_ctx.ppa); - - for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { - if (gc_rq->lba_list[i] == ADDR_EMPTY) - continue; - - w_ctx.lba = gc_rq->lba_list[i]; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, - gc_rq->paddr_list[i], pos); - - data += PBLK_EXPOSED_PAGE_SIZE; - valid_entries++; - } - - WARN_ONCE(gc_rq->secs_to_gc != valid_entries, - "pblk: inconsistent GC write\n"); - - atomic64_add(valid_entries, &pblk->gc_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_entries, &pblk->inflight_writes); - atomic_long_add(valid_entries, &pblk->recov_gc_writes); -#endif - - pblk_write_should_kick(pblk); - return NVM_IO_OK; -} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c deleted file mode 100644 index 33d39d3dd343..000000000000 --- a/drivers/lightnvm/pblk-core.c +++ /dev/null @@ -1,2151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-core.c - pblk's core functionality - * - */ - -#define CREATE_TRACE_POINTS - -#include "pblk.h" -#include "pblk-trace.h" - -static void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_chunk_meta(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, *ppa); - pos = pblk_ppa_to_pos(&dev->geo, *ppa); - - pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, - struct ppa_addr ppa_addr) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa; - int pos = pblk_ppa_to_pos(geo, ppa_addr); - - pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); - atomic_long_inc(&pblk->erase_failed); - - atomic_dec(&line->blk_in_line); - if (test_and_set_bit(pos, line->blk_bitmap)) - pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", - line->id, pos); - - /* Not necessary to mark bad blocks on 2.0 spec. 
*/ - if (geo->version == NVM_OCSSD_SPEC_20) - return; - - ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); - if (!ppa) - return; - - *ppa = ppa_addr; - pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, - GFP_ATOMIC, pblk->bb_wq); -} - -static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, rqd->ppa_addr); - pos = pblk_ppa_to_pos(geo, rqd->ppa_addr); - chunk = &line->chks[pos]; - - atomic_dec(&line->left_seblks); - - if (rqd->error) { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_FAILED); - - chunk->state = NVM_CHK_ST_OFFLINE; - pblk_mark_bb(pblk, line, rqd->ppa_addr); - } else { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_DONE); - - chunk->state = NVM_CHK_ST_FREE; - } - - trace_pblk_chunk_state(pblk_disk_name(pblk), &rqd->ppa_addr, - chunk->state); - - atomic_dec(&pblk->inflight_io); -} - -/* Erase completion assumes that only one block is erased at the time */ -static void pblk_end_io_erase(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - - __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, &pblk->e_rq_pool); -} - -/* - * Get information for all chunks from the device. 
- * - * The caller is responsible for freeing (vmalloc) the returned structure - */ -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *meta; - struct ppa_addr ppa; - unsigned long len; - int ret; - - ppa.ppa = 0; - - len = geo->all_chunks * sizeof(*meta); - meta = vzalloc(len); - if (!meta) - return ERR_PTR(-ENOMEM); - - ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); - if (ret) { - vfree(meta); - return ERR_PTR(-EIO); - } - - return meta; -} - -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *meta, - struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun; - int lun_off = ppa.m.pu * geo->num_chk; - int chk_off = ppa.m.chk; - - return meta + ch_off + lun_off + chk_off; -} - -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - - /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P - * table is modified with reclaimed sectors, a check is done to endure - * that newer updates are not overwritten. 
- */ - spin_lock(&line->lock); - WARN_ON(line->state == PBLK_LINESTATE_FREE); - - if (test_and_set_bit(paddr, line->invalid_bitmap)) { - WARN_ONCE(1, "pblk: double invalidate\n"); - spin_unlock(&line->lock); - return; - } - le32_add_cpu(line->vsc, -1); - - if (line->state == PBLK_LINESTATE_CLOSED) - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - if (move_list) { - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC) { - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - return; - } - spin_unlock(&line->lock); - - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - } -} - -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - u64 paddr; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); - BUG_ON(pblk_ppa_empty(ppa)); -#endif - - line = pblk_ppa_to_line(pblk, ppa); - paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); - - __pblk_map_invalidate(pblk, line, paddr); -} - -static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, - unsigned int nr_secs) -{ - sector_t lba; - - spin_lock(&pblk->trans_lock); - for (lba = slba; lba < slba + nr_secs; lba++) { - struct ppa_addr ppa; - - ppa = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); - - pblk_ppa_set_empty(&ppa); - pblk_trans_map_set(pblk, lba, ppa); - } - spin_unlock(&pblk->trans_lock); -} - -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_meta_list); - if (!rqd->meta_list) - return -ENOMEM; - - if (rqd->nr_ppas == 1) - return 0; - - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); - rqd->dma_ppa_list = rqd->dma_meta_list 
+ pblk_dma_meta_size(pblk); - - return 0; -} - -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - if (rqd->meta_list) - nvm_dev_dma_free(dev->parent, rqd->meta_list, - rqd->dma_meta_list); -} - -/* Caller must guarantee that the request is a valid type */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) -{ - mempool_t *pool; - struct nvm_rq *rqd; - int rq_size; - - switch (type) { - case PBLK_WRITE: - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - rq_size = pblk_w_rq_size; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - rq_size = pblk_g_rq_size; - break; - default: - pool = &pblk->e_rq_pool; - rq_size = pblk_g_rq_size; - } - - rqd = mempool_alloc(pool, GFP_KERNEL); - memset(rqd, 0, rq_size); - - return rqd; -} - -/* Typically used on completion path. Cannot guarantee request consistency */ -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) -{ - mempool_t *pool; - - switch (type) { - case PBLK_WRITE: - kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); - fallthrough; - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - break; - case PBLK_ERASE: - pool = &pblk->e_rq_pool; - break; - default: - pblk_err(pblk, "trying to free unknown rqd type\n"); - return; - } - - pblk_free_rqd_meta(pblk, rqd); - mempool_free(rqd, pool); -} - -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages) -{ - struct bio_vec *bv; - struct page *page; - int i, e, nbv = 0; - - for (i = 0; i < bio->bi_vcnt; i++) { - bv = &bio->bi_io_vec[i]; - page = bv->bv_page; - for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) - if (nbv >= off) - mempool_free(page++, &pblk->page_bio_pool); - } -} - -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages) -{ - struct request_queue *q = pblk->dev->q; - struct page *page; - int i, ret; - - for (i = 0; i < nr_pages; i++) { - 
page = mempool_alloc(&pblk->page_bio_pool, flags); - - ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); - if (ret != PBLK_EXPOSED_PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - mempool_free(page, &pblk->page_bio_pool); - goto err; - } - } - - return 0; -err: - pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); - return -1; -} - -void pblk_write_kick(struct pblk *pblk) -{ - wake_up_process(pblk->writer_ts); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); -} - -void pblk_write_timer_fn(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, wtimer); - - /* kick the write thread every tick to flush outstanding data */ - pblk_write_kick(pblk); -} - -void pblk_write_should_kick(struct pblk *pblk) -{ - unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - - if (secs_avail >= pblk->min_write_pgs_data) - pblk_write_kick(pblk); -} - -static void pblk_wait_for_meta(struct pblk *pblk) -{ - do { - if (!atomic_read(&pblk->inflight_io)) - break; - - schedule(); - } while (1); -} - -static void pblk_flush_writer(struct pblk *pblk) -{ - pblk_rb_flush(&pblk->rwb); - do { - if (!pblk_rb_sync_count(&pblk->rwb)) - break; - - pblk_write_kick(pblk); - schedule(); - } while (1); -} - -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) - * (pblk->min_write_pgs - pblk->min_write_pgs_data); - int vsc = le32_to_cpu(*line->vsc) + packed_meta; - - lockdep_assert_held(&line->lock); - - if (line->w_err_gc->has_write_err) { - if (line->gc_group != PBLK_LINEGC_WERR) { - line->gc_group = PBLK_LINEGC_WERR; - move_list = &l_mg->gc_werr_list; - pblk_rl_werr_line_in(&pblk->rl); - } - } else if (!vsc) { - if (line->gc_group != PBLK_LINEGC_FULL) { - line->gc_group = PBLK_LINEGC_FULL; - move_list = 
&l_mg->gc_full_list; - } - } else if (vsc < lm->high_thrs) { - if (line->gc_group != PBLK_LINEGC_HIGH) { - line->gc_group = PBLK_LINEGC_HIGH; - move_list = &l_mg->gc_high_list; - } - } else if (vsc < lm->mid_thrs) { - if (line->gc_group != PBLK_LINEGC_MID) { - line->gc_group = PBLK_LINEGC_MID; - move_list = &l_mg->gc_mid_list; - } - } else if (vsc < line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_LOW) { - line->gc_group = PBLK_LINEGC_LOW; - move_list = &l_mg->gc_low_list; - } - } else if (vsc == line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_EMPTY) { - line->gc_group = PBLK_LINEGC_EMPTY; - move_list = &l_mg->gc_empty_list; - } - } else { - line->state = PBLK_LINESTATE_CORRUPT; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - line->gc_group = PBLK_LINEGC_NONE; - move_list = &l_mg->corrupt_list; - pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, vsc, - line->sec_in_line, - lm->high_thrs, lm->mid_thrs); - } - - return move_list; -} - -void pblk_discard(struct pblk *pblk, struct bio *bio) -{ - sector_t slba = pblk_get_lba(bio); - sector_t nr_secs = pblk_get_secs(bio); - - pblk_invalidate_range(pblk, slba, nr_secs); -} - -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - atomic_long_inc(&pblk->write_failed); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - /* Empty page read is not necessarily an error (e.g., L2P recovery) */ - if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { - atomic_long_inc(&pblk->read_empty); - return; - } - - switch (rqd->error) { - case NVM_RSP_WARN_HIGHECC: - atomic_long_inc(&pblk->read_high_ecc); - break; - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_ERR_FAILCRC: - atomic_long_inc(&pblk->read_failed); - break; - default: - pblk_err(pblk, "unknown read error:%d\n", rqd->error); - } -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, 
rqd->error); -#endif -} - -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) -{ - pblk->sec_per_write = sec_per_write; -} - -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - return nvm_submit_io(dev, rqd, buf); -} - -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - struct ppa_addr *ppa = &ppa_list[i]; - struct nvm_chk_meta *chunk = pblk_dev_ppa_to_chunk(pblk, *ppa); - u64 caddr = pblk_dev_ppa_to_chunk_addr(pblk, *ppa); - - if (caddr == 0) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_OPEN); - else if (caddr == (chunk->cnlb - 1)) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_CLOSED); - } -} - -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - int ret; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - ret = nvm_submit_io_sync(dev, rqd, buf); - - if (trace_pblk_chunk_state_enabled() && !ret && - rqd->opcode == NVM_OP_PWRITE) - pblk_check_chunk_state_update(pblk, rqd); - - return ret; -} - -static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, - void *buf) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int ret; - - pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd, buf); - pblk_up_chunk(pblk, ppa_list[0]); - - return ret; -} - -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta) -{ - int max = pblk->sec_per_write; - int min = pblk->min_write_pgs; - int secs_to_sync = 0; - - if (skip_meta && pblk->min_write_pgs_data != 
pblk->min_write_pgs) - min = max = pblk->min_write_pgs_data; - - if (secs_avail >= max) - secs_to_sync = max; - else if (secs_avail >= min) - secs_to_sync = min * (secs_avail / min); - else if (secs_to_flush) - secs_to_sync = min; - - return secs_to_sync; -} - -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - spin_lock(&line->lock); - addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - line->cur_sec = addr - nr_secs; - - for (i = 0; i < nr_secs; i++, line->cur_sec--) - WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); - spin_unlock(&line->lock); -} - -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - lockdep_assert_held(&line->lock); - - /* logic error: ppa out-of-bounds. Prevent generating bad address */ - if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { - WARN(1, "pblk: page allocation out of bounds\n"); - nr_secs = pblk->lm.sec_per_line - line->cur_sec; - } - - line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - for (i = 0; i < nr_secs; i++, line->cur_sec++) - WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); - - return addr; -} - -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - - /* Lock needed in case a write fails and a recovery needs to remap - * failed write buffer entries - */ - spin_lock(&line->lock); - addr = __pblk_alloc_page(pblk, line, nr_secs); - line->left_msecs -= nr_secs; - WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); - spin_unlock(&line->lock); - - return addr; -} - -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) -{ - u64 paddr; - - spin_lock(&line->lock); - paddr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - spin_unlock(&line->lock); - - return paddr; -} - -u64 pblk_line_smeta_start(struct pblk *pblk, 
struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int bit; - - /* This usually only happens on bad lines */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (bit >= lm->blk_per_line) - return -1; - - return bit * geo->ws_opt; -} - -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = pblk_line_smeta_start(pblk, line); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PWRITE; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - rqd.meta_list, i); - - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - 
meta->lba = lba_list[paddr] = addr_empty; - } - - ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - pblk_log_write_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *ppa_list_buf, *meta_list; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = line->emeta_ssec; - dma_addr_t dma_ppa_list, dma_meta_list; - int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec[0]; - int line_id = line->id; - int rq_ppas, rq_len; - int i, j; - int ret; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list_buf = meta_list + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - -next_rq: - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - rq_len = rq_ppas * geo->csecs; - - rqd.meta_list = meta_list; - rqd.ppa_list = ppa_list_buf; - rqd.dma_meta_list = dma_meta_list; - rqd.dma_ppa_list = dma_ppa_list; - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = rq_ppas; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < rqd.nr_ppas; ) { - struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id); - int pos = pblk_ppa_to_pos(geo, ppa); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd.is_seq = 1; - - while (test_bit(pos, line->blk_bitmap)) { - paddr += min; - if (pblk_boundary_paddr_checks(pblk, paddr)) { - ret = -EINTR; - goto free_rqd_dma; - } - - ppa = addr_to_gen_ppa(pblk, paddr, line_id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - ret = -EINTR; - goto 
free_rqd_dma; - } - - for (j = 0; j < min; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); - } - - ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto free_rqd_dma; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - goto free_rqd_dma; - } - - emeta_buf += rq_len; - left_ppas -= rq_ppas; - if (left_ppas) - goto next_rq; - -free_rqd_dma: - nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return ret; -} - -static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr ppa) -{ - rqd->opcode = NVM_OP_ERASE; - rqd->ppa_addr = ppa; - rqd->nr_ppas = 1; - rqd->is_seq = 1; - rqd->bio = NULL; -} - -static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq rqd = {NULL}; - int ret; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), &ppa, - PBLK_CHUNK_RESET_START); - - pblk_setup_e_rq(pblk, &rqd, ppa); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. 
- */ - ret = pblk_submit_io_sync(pblk, &rqd, NULL); - rqd.private = pblk; - __pblk_end_io_erase(pblk, &rqd); - - return ret; -} - -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr ppa; - int ret, bit = -1; - - /* Erase only good blocks, one at a time */ - do { - spin_lock(&line->lock); - bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, - bit + 1); - if (bit >= lm->blk_per_line) { - spin_unlock(&line->lock); - break; - } - - ppa = pblk->luns[bit].bppa; /* set ch and lun */ - ppa.a.blk = line->id; - - atomic_dec(&line->left_eblks); - WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); - spin_unlock(&line->lock); - - ret = pblk_blk_erase_sync(pblk, ppa); - if (ret) { - pblk_err(pblk, "failed to erase line %d\n", line->id); - return ret; - } - } while (1); - - return 0; -} - -static void pblk_line_setup_metadata(struct pblk_line *line, - struct pblk_line_mgmt *l_mg, - struct pblk_line_meta *lm) -{ - int meta_line; - - lockdep_assert_held(&l_mg->free_lock); - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - line->meta_line = meta_line; - - line->smeta = l_mg->sline_meta[meta_line]; - line->emeta = l_mg->eline_meta[meta_line]; - - memset(line->smeta, 0, lm->smeta_len); - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, - struct pblk_line *cur) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; - int nr_blk_line; - - /* After erasing the line, new bad blocks might appear and we risk - * having an invalid line - */ - nr_blk_line = lm->blk_per_line - - bitmap_weight(line->blk_bitmap, lm->blk_per_line); - if (nr_blk_line < lm->min_blk_line) { - spin_lock(&l_mg->free_lock); - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - spin_unlock(&l_mg->free_lock); - - pblk_debug(pblk, "line %d is bad\n", line->id); - - return 0; - } - - /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); - - /* Mark LUNs allocated in this line (all for now) */ - bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - - smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(smeta_buf->header.uuid, &pblk->instance_uuid); - smeta_buf->header.id = cpu_to_le32(line->id); - smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version_major = SMETA_VERSION_MAJOR; - smeta_buf->header.version_minor = SMETA_VERSION_MINOR; - - /* Start metadata */ - smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); - - /* Fill metadata among lines */ - if (cur) { - memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta_buf->prev_id = cpu_to_le32(cur->id); - cur->emeta->buf->next_id = cpu_to_le32(line->id); - } else { - smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); - } - - /* All smeta must 
be set at this point */ - smeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); - smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); - - /* End metadata */ - memcpy(&emeta_buf->header, &smeta_buf->header, - sizeof(struct line_header)); - - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - - emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta_buf->nr_valid_lbas = cpu_to_le64(0); - emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta_buf->crc = cpu_to_le32(0); - emeta_buf->prev_id = smeta_buf->prev_id; - - return 1; -} - -static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - line->map_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->map_bitmap) - return -ENOMEM; - - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - - /* will be initialized using bb info from map_bitmap */ - line->invalid_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - return -ENOMEM; - } - - return 0; -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. 
- */ -static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, - int init) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - u64 off; - int bit = -1; - int emeta_secs; - - line->sec_in_line = lm->sec_per_line; - - /* Capture bad block information on line mapping bitmaps */ - while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, - bit + 1)) < lm->blk_per_line) { - off = bit * geo->ws_opt; - bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, - lm->sec_per_line); - bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, - lm->sec_per_line); - line->sec_in_line -= geo->clba; - } - - /* Mark smeta metadata sectors as bad sectors */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - off = bit * geo->ws_opt; - bitmap_set(line->map_bitmap, off, lm->smeta_sec); - line->sec_in_line -= lm->smeta_sec; - line->cur_sec = off + lm->smeta_sec; - - if (init && pblk_line_smeta_write(pblk, line, off)) { - pblk_debug(pblk, "line smeta I/O failed. Retry\n"); - return 0; - } - - bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); - - /* Mark emeta metadata sectors as bad sectors. 
We need to consider bad - * blocks to make sure that there are enough sectors to store emeta - */ - emeta_secs = lm->emeta_sec[0]; - off = lm->sec_per_line; - while (emeta_secs) { - off -= geo->ws_opt; - if (!test_bit(off, line->invalid_bitmap)) { - bitmap_set(line->invalid_bitmap, off, geo->ws_opt); - emeta_secs -= geo->ws_opt; - } - } - - line->emeta_ssec = off; - line->sec_in_line -= lm->emeta_sec[0]; - line->nr_valid_lbas = 0; - line->left_msecs = line->sec_in_line; - *line->vsc = cpu_to_le32(line->sec_in_line); - - if (lm->sec_per_line - line->sec_in_line != - bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - pblk_err(pblk, "unexpected line %d is bad\n", line->id); - - return 0; - } - - return 1; -} - -static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int blk_to_erase = atomic_read(&line->blk_in_line); - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - /* Free chunks should not be erased */ - if (state & NVM_CHK_ST_FREE) { - set_bit(pblk_ppa_to_pos(geo, rlun->bppa), - line->erase_bitmap); - blk_to_erase--; - } - } - - return blk_to_erase; -} - -static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = atomic_read(&line->blk_in_line); - int blk_to_erase; - - /* Bad blocks do not need to be erased */ - bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); - - spin_lock(&line->lock); - - /* If we have not written to this line, we need to mark up free chunks - * as already erased - */ 
- if (line->state == PBLK_LINESTATE_NEW) { - blk_to_erase = pblk_prepare_new_line(pblk, line); - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } else { - blk_to_erase = blk_in_line; - } - - if (blk_in_line < lm->min_blk_line) { - spin_unlock(&line->lock); - return -EAGAIN; - } - - if (line->state != PBLK_LINESTATE_FREE) { - WARN(1, "pblk: corrupted line %d, state %d\n", - line->id, line->state); - spin_unlock(&line->lock); - return -EINTR; - } - - line->state = PBLK_LINESTATE_OPEN; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - atomic_set(&line->left_eblks, blk_to_erase); - atomic_set(&line->left_seblks, blk_to_erase); - - line->meta_distance = lm->meta_distance; - spin_unlock(&line->lock); - - kref_init(&line->ref); - atomic_set(&line->sec_to_update, 0); - - return 0; -} - -/* Line allocations in the recovery path are always single threaded */ -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - l_mg->data_line = line; - list_del(&line->list); - - ret = pblk_line_prepare(pblk, line); - if (ret) { - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - return ret; - } - spin_unlock(&l_mg->free_lock); - - ret = pblk_line_alloc_bitmaps(pblk, line); - if (ret) - goto fail; - - if (!pblk_line_init_bb(pblk, line, 0)) { - ret = -EINTR; - goto fail; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - return 0; - -fail: - spin_lock(&l_mg->free_lock); - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - - return ret; -} - -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -static void pblk_line_reinit(struct pblk_line *line) -{ 
- *line->vsc = cpu_to_le32(EMPTY_ENTRY); - - line->map_bitmap = NULL; - line->invalid_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -void pblk_line_free(struct pblk_line *line) -{ - struct pblk *pblk = line->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - mempool_free(line->invalid_bitmap, l_mg->bitmap_pool); - - pblk_line_reinit(line); -} - -struct pblk_line *pblk_line_get(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line; - int ret, bit; - - lockdep_assert_held(&l_mg->free_lock); - -retry: - if (list_empty(&l_mg->free_list)) { - pblk_err(pblk, "no free lines\n"); - return NULL; - } - - line = list_first_entry(&l_mg->free_list, struct pblk_line, list); - list_del(&line->list); - l_mg->nr_free_lines--; - - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (unlikely(bit >= lm->blk_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - - pblk_debug(pblk, "line %d is bad\n", line->id); - goto retry; - } - - ret = pblk_line_prepare(pblk, line); - if (ret) { - switch (ret) { - case -EAGAIN: - list_add(&line->list, &l_mg->bad_list); - goto retry; - case -EINTR: - list_add(&line->list, &l_mg->corrupt_list); - goto retry; - default: - pblk_err(pblk, "failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - return NULL; - } - } - - return line; -} - -static struct pblk_line *pblk_line_retry(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *retry_line; - -retry: - spin_lock(&l_mg->free_lock); - retry_line = pblk_line_get(pblk); - if (!retry_line) { - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return 
NULL; - } - - retry_line->map_bitmap = line->map_bitmap; - retry_line->invalid_bitmap = line->invalid_bitmap; - retry_line->smeta = line->smeta; - retry_line->emeta = line->emeta; - retry_line->meta_line = line->meta_line; - - pblk_line_reinit(line); - - l_mg->data_line = retry_line; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_dec(&pblk->rl, line, false); - - if (pblk_line_erase(pblk, retry_line)) - goto retry; - - return retry_line; -} - -static void pblk_set_space_limit(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - - atomic_set(&rl->rb_space, 0); -} - -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - - spin_lock(&l_mg->free_lock); - line = pblk_line_get(pblk); - if (!line) { - spin_unlock(&l_mg->free_lock); - return NULL; - } - - line->seq_nr = l_mg->d_seq_nr++; - line->type = PBLK_LINETYPE_DATA; - l_mg->data_line = line; - - pblk_line_setup_metadata(line, l_mg, &pblk->lm); - - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. 
- * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_set_space_limit(pblk); - - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - - if (pblk_line_alloc_bitmaps(pblk, line)) - return NULL; - - if (pblk_line_erase(pblk, line)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - } - -retry_setup: - if (!pblk_line_init_metadata(pblk, line, NULL)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, line, 1)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - - return line; -} - -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - - line = pblk_ppa_to_line(pblk, ppa); - kref_put(&line->ref, pblk_line_put_wq); -} - -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int i; - - for (i = 0; i < rqd->nr_ppas; i++) - pblk_ppa_to_line_put(pblk, ppa_list[i]); -} - -static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) -{ - lockdep_assert_held(&pblk->l_mg.free_lock); - - pblk_set_space_limit(pblk); - pblk->state = PBLK_STATE_STOPPING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); -} - -static void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct pblk_emeta *emeta = 
line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pblk_err(pblk, "sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - -void __pblk_pipeline_flush(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - if (pblk->state == PBLK_STATE_RECOVERING || - pblk->state == PBLK_STATE_STOPPED) { - spin_unlock(&l_mg->free_lock); - return; - } - pblk->state = PBLK_STATE_RECOVERING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - spin_unlock(&l_mg->free_lock); - - pblk_flush_writer(pblk); - pblk_wait_for_meta(pblk); - - ret = pblk_recov_pad(pblk); - if (ret) { - pblk_err(pblk, "could not close data on teardown(%d)\n", ret); - return; - } - - flush_workqueue(pblk->bb_wq); - pblk_line_close_meta_sync(pblk); -} - -void __pblk_pipeline_stop(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - spin_lock(&l_mg->free_lock); - pblk->state = PBLK_STATE_STOPPED; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - l_mg->data_line = NULL; - l_mg->data_next = NULL; - spin_unlock(&l_mg->free_lock); -} - -void pblk_pipeline_stop(struct pblk *pblk) -{ - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); -} - -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new = NULL; - unsigned int left_seblks; - - new = l_mg->data_next; - if (!new) - goto out; - - spin_lock(&l_mg->free_lock); - cur = l_mg->data_line; - l_mg->data_line = new; - - pblk_line_setup_metadata(new, l_mg, &pblk->lm); - spin_unlock(&l_mg->free_lock); - -retry_erase: - left_seblks = atomic_read(&new->left_seblks); - if (left_seblks) { - /* If line is not fully erased, erase it */ - if (atomic_read(&new->left_eblks)) { - if (pblk_line_erase(pblk, new)) - goto out; - } else { - io_schedule(); - 
} - goto retry_erase; - } - - if (pblk_line_alloc_bitmaps(pblk, new)) - return NULL; - -retry_setup: - if (!pblk_line_init_metadata(pblk, new, cur)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, new, 1)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, new, true); - - /* Allocate next line for preparation */ - spin_lock(&l_mg->free_lock); - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. - * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_stop_writes(pblk, new); - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - -out: - return new; -} - -static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - if (line->w_err_gc->has_gc_err) { - spin_unlock(&line->lock); - pblk_err(pblk, "line %d had errors during GC\n", line->id); - pblk_put_line_back(pblk, line); - line->w_err_gc->has_gc_err = 0; - return; - } - - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - line->gc_group = PBLK_LINEGC_NONE; - pblk_line_free(line); - - if (line->w_err_gc->has_write_err) { - pblk_rl_werr_line_out(&pblk->rl); - line->w_err_gc->has_write_err = 0; - } - - spin_unlock(&line->lock); - atomic_dec(&gc->pipeline_gc); - - spin_lock(&l_mg->free_lock); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_inc(&pblk->rl, line); -} - -static void pblk_line_put_ws(struct work_struct *work) -{ - struct pblk_line_ws 
*line_put_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = line_put_ws->pblk; - struct pblk_line *line = line_put_ws->line; - - __pblk_line_put(pblk, line); - mempool_free(line_put_ws, &pblk->gen_ws_pool); -} - -void pblk_line_put(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - - __pblk_line_put(pblk, line); -} - -void pblk_line_put_wq(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - struct pblk_line_ws *line_put_ws; - - line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); - if (!line_put_ws) - return; - - line_put_ws->pblk = pblk; - line_put_ws->line = line; - line_put_ws->priv = NULL; - - INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); - queue_work(pblk->r_end_wq, &line_put_ws->ws); -} - -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq *rqd; - int err; - - rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); - - pblk_setup_e_rq(pblk, rqd, ppa); - - rqd->end_io = pblk_end_io_erase; - rqd->private = pblk; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &ppa, PBLK_CHUNK_RESET_START); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. 
- */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - pblk_err(pblk, "could not async erase line:%d,blk:%d\n", - pblk_ppa_to_line_id(ppa), - pblk_ppa_to_pos(geo, ppa)); - } - - return err; -} - -struct pblk_line *pblk_line_get_data(struct pblk *pblk) -{ - return pblk->l_mg.data_line; -} - -/* For now, always erase next line */ -struct pblk_line *pblk_line_get_erase(struct pblk *pblk) -{ - return pblk->l_mg.data_next; -} - -int pblk_line_is_full(struct pblk_line *line) -{ - return (line->left_msecs == 0); -} - -static void pblk_line_should_sync_meta(struct pblk *pblk) -{ - if (pblk_rl_is_limit(&pblk->rl)) - pblk_line_close_meta_sync(pblk); -} - -void pblk_line_close(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - int i; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), - "pblk: corrupt closed line %d\n", line->id); -#endif - - spin_lock(&l_mg->free_lock); - WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_OPEN); - line->state = PBLK_LINESTATE_CLOSED; - move_list = pblk_line_gc_list(pblk, line); - list_add_tail(&line->list, move_list); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - if (!(state & NVM_CHK_ST_OFFLINE)) - state = NVM_CHK_ST_CLOSED; - } - - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - 
line->state); -} - -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct wa_counters *wa = emeta_to_wa(lm, emeta_buf); - - /* No need for exact vsc value; avoid a big line lock and take aprox. */ - memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); - memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); - - wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa)); - wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa)); - wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa)); - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { - emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(emeta_buf->header.uuid, &pblk->instance_uuid); - emeta_buf->header.id = cpu_to_le32(line->id); - emeta_buf->header.type = cpu_to_le16(line->type); - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - } - - emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); - emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); - - spin_lock(&l_mg->close_lock); - spin_lock(&line->lock); - - /* Update the in-memory start address for emeta, in case it has - * shifted due to write errors - */ - if (line->emeta_ssec != line->cur_sec) - line->emeta_ssec = line->cur_sec; - - list_add_tail(&line->list, &l_mg->emeta_list); - spin_unlock(&line->lock); - spin_unlock(&l_mg->close_lock); - - pblk_line_should_sync_meta(pblk); -} - -static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - struct pblk_emeta *emeta = 
line->emeta; - - w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), - lba_list_size); -} - -void pblk_line_close_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - /* Write errors makes the emeta start address stored in smeta invalid, - * so keep a copy of the lba list until we've gc'd the line - */ - if (w_err_gc->has_write_err) - pblk_save_lba_list(pblk, line); - - pblk_line_close(pblk, line); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq) -{ - struct pblk_line_ws *line_ws; - - line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); - if (!line_ws) { - pblk_err(pblk, "pblk: could not allocate memory\n"); - return; - } - - line_ws->pblk = pblk; - line_ws->line = line; - line_ws->priv = priv; - - INIT_WORK(&line_ws->ws, work); - queue_work(wq, &line_ws->ws); -} - -static void __pblk_down_chunk(struct pblk *pblk, int pos) -{ - struct pblk_lun *rlun = &pblk->luns[pos]; - int ret; - - /* - * Only send one inflight I/O per LUN. 
Since we map at a page - * granurality, all ppas in the I/O will map to the same LUN - */ - - ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret == -ETIME || ret == -EINTR) - pblk_err(pblk, "taking lun semaphore timed out: err %d\n", - -ret); -} - -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - __pblk_down_chunk(pblk, pos); -} - -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - /* If the LUN has been locked for this same request, do no attempt to - * lock it again - */ - if (test_and_set_bit(pos, lun_bitmap)) - return; - - __pblk_down_chunk(pblk, pos); -} - -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int pos = pblk_ppa_to_pos(geo, ppa); - - rlun = &pblk->luns[pos]; - up(&rlun->wr_sem); -} - -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int num_lun = geo->all_luns; - int bit = -1; - - while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) { - rlun = &pblk->luns[bit]; - up(&rlun->wr_sem); - } -} - -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - struct ppa_addr ppa_l2p; - - /* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) - pblk_map_invalidate(pblk, ppa_l2p); - - pblk_trans_map_set(pblk, lba, ppa); - spin_unlock(&pblk->trans_lock); -} - -void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); -#endif - - pblk_update_map(pblk, lba, ppa); -} - -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line, u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int ret = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa_new)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); -#endif - - /* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return 0; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { - spin_lock(&gc_line->lock); - WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), - "pblk: corrupted GC update"); - spin_unlock(&gc_line->lock); - - ret = 0; - goto out; - } - - pblk_trans_map_set(pblk, lba, ppa_new); -out: - spin_unlock(&pblk->trans_lock); - return ret; -} - -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) -{ - struct ppa_addr ppa_l2p; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa_mapped)); -#endif - /* Invalidate and discard padded entries */ - if (lba == ADDR_EMPTY) { - atomic64_inc(&pblk->pad_wa); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->padded_wb); -#endif - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - return; - } - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - /* Do not update L2P if the cacheline has been updated. 
In this case, - * the mapped ppa must be invalidated - */ - if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - goto out; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); -#endif - - pblk_trans_map_set(pblk, lba, ppa_mapped); -out: - spin_unlock(&pblk->trans_lock); -} - -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache) -{ - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - struct ppa_addr ppa; - - ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); - - /* If the L2P entry maps to a line, the reference is valid */ - if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { - struct pblk_line *line = pblk_ppa_to_line(pblk, ppa); - - if (i > 0 && *from_cache) - break; - *from_cache = false; - - kref_get(&line->ref); - } else { - if (i > 0 && !*from_cache) - break; - *from_cache = true; - } - } - spin_unlock(&pblk->trans_lock); - return i; -} - -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs) -{ - u64 lba; - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - lba = lba_list[i]; - if (lba != ADDR_EMPTY) { - /* logic error: lba out-of-bounds. 
Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - continue; - } - ppas[i] = pblk_trans_map_get(pblk, lba); - } - } - spin_unlock(&pblk->trans_lock); -} - -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *buffer; - - if (pblk_is_oob_meta_supported(pblk)) { - /* Just use OOB metadata buffer as always */ - buffer = rqd->meta_list; - } else { - /* We need to reuse last page of request (packed metadata) - * in similar way as traditional oob metadata - */ - buffer = page_to_virt( - rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - } - - return buffer; -} - -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *meta_list = rqd->meta_list; - void *page; - int i = 0; - - if (pblk_is_oob_meta_supported(pblk)) - return; - - page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - /* We need to fill oob meta buffer with data from packed metadata */ - for (; i < rqd->nr_ppas; i++) - memcpy(pblk_get_meta(pblk, meta_list, i), - page + (i * sizeof(struct pblk_sec_meta)), - sizeof(struct pblk_sec_meta)); -} diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c deleted file mode 100644 index b31658be35a7..000000000000 --- a/drivers/lightnvm/pblk-gc.c +++ /dev/null @@ -1,726 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-gc.c - pblk's garbage collector - */ - -#include "pblk.h" -#include "pblk-trace.h" -#include - - -static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) -{ - vfree(gc_rq->data); - kfree(gc_rq); -} - -static int pblk_gc_write(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_gc_rq *gc_rq, *tgc_rq; - LIST_HEAD(w_list); - - spin_lock(&gc->w_lock); - if (list_empty(&gc->w_list)) { - spin_unlock(&gc->w_lock); - return 1; - } - - list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); - gc->w_entries = 0; - spin_unlock(&gc->w_lock); - - list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq); - list_del(&gc_rq->list); - kref_put(&gc_rq->line->ref, pblk_line_put); - pblk_gc_free_gc_rq(gc_rq); - } - - return 0; -} - -static void pblk_gc_writer_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_writer_ts); -} - -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - /* We need to reset gc_group in order to ensure that - * pblk_line_gc_list will return proper move_list - * since right now current line is not on any of the - * gc lists. 
- */ - line->gc_group = PBLK_LINEGC_NONE; - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - list_add_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); -} - -static void pblk_gc_line_ws(struct work_struct *work) -{ - struct pblk_line_ws *gc_rq_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = gc_rq_ws->pblk; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq_ws->line; - struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; - int ret; - - up(&gc->gc_sem); - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) { - line->w_err_gc->has_gc_err = 1; - goto out; - } - - if (!gc_rq->secs_to_gc) - goto out; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - kfree(gc_rq_ws); - return; - -out: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - kfree(gc_rq_ws); -} - -static __le64 *get_lba_list_from_emeta(struct pblk *pblk, - struct pblk_line *line) -{ - struct line_emeta *emeta_buf; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - __le64 *lba_list; - int ret; - - emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta_buf) - return NULL; - - ret = pblk_line_emeta_read(pblk, line, emeta_buf); - if (ret) { - pblk_err(pblk, "line %d read emeta failed (%d)\n", - line->id, ret); - kvfree(emeta_buf); - return NULL; - } - - /* If this read fails, it means that emeta is corrupted. - * For now, leave the line untouched. - * TODO: Implement a recovery routine that scans and moves - * all sectors on the line. 
- */ - - ret = pblk_recov_check_emeta(pblk, emeta_buf); - if (ret) { - pblk_err(pblk, "inconsistent emeta (line %d)\n", - line->id); - kvfree(emeta_buf); - return NULL; - } - - lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - - if (lba_list) - memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); - - kvfree(emeta_buf); - - return lba_list; -} - -static void pblk_gc_line_prepare_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *gc_rq_ws; - struct pblk_gc_rq *gc_rq; - __le64 *lba_list; - unsigned long *invalid_bitmap; - int sec_left, nr_secs, bit; - - invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!invalid_bitmap) - goto fail_free_ws; - - if (line->w_err_gc->has_write_err) { - lba_list = line->w_err_gc->lba_list; - line->w_err_gc->lba_list = NULL; - } else { - lba_list = get_lba_list_from_emeta(pblk, line); - if (!lba_list) { - pblk_err(pblk, "could not interpret emeta (line %d)\n", - line->id); - goto fail_free_invalid_bitmap; - } - } - - spin_lock(&line->lock); - bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); - sec_left = pblk_line_vsc(line); - spin_unlock(&line->lock); - - if (sec_left < 0) { - pblk_err(pblk, "corrupted GC line (%d)\n", line->id); - goto fail_free_lba_list; - } - - bit = -1; -next_rq: - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) - goto fail_free_lba_list; - - nr_secs = 0; - do { - bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, - bit + 1); - if (bit > line->emeta_ssec) - break; - - gc_rq->paddr_list[nr_secs] = bit; - gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); - } while (nr_secs < pblk->max_write_pgs); - - if (unlikely(!nr_secs)) { - 
kfree(gc_rq); - goto out; - } - - gc_rq->nr_secs = nr_secs; - gc_rq->line = line; - - gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); - if (!gc_rq->data) - goto fail_free_gc_rq; - - gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!gc_rq_ws) - goto fail_free_gc_data; - - gc_rq_ws->pblk = pblk; - gc_rq_ws->line = line; - gc_rq_ws->priv = gc_rq; - - /* The write GC path can be much slower than the read GC one due to - * the budget imposed by the rate-limiter. Balance in case that we get - * back pressure from the write GC path. - */ - while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) - io_schedule(); - - kref_get(&line->ref); - - INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); - - sec_left -= nr_secs; - if (sec_left > 0) - goto next_rq; - -out: - kvfree(lba_list); - kfree(line_ws); - kfree(invalid_bitmap); - - kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->read_inflight_gc); - - return; - -fail_free_gc_data: - vfree(gc_rq->data); -fail_free_gc_rq: - kfree(gc_rq); -fail_free_lba_list: - kvfree(lba_list); -fail_free_invalid_bitmap: - kfree(invalid_bitmap); -fail_free_ws: - kfree(line_ws); - - /* Line goes back to closed state, so we cannot release additional - * reference for line, since we do that only when we want to do - * gc to free line state transition. 
- */ - pblk_put_line_back(pblk, line); - atomic_dec(&gc->read_inflight_gc); - - pblk_err(pblk, "failed to GC line %d\n", line->id); -} - -static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *line_ws; - - pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); - - line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!line_ws) - return -ENOMEM; - - line_ws->pblk = pblk; - line_ws->line = line; - - atomic_inc(&gc->pipeline_gc); - INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); - queue_work(gc->gc_reader_wq, &line_ws->ws); - - return 0; -} - -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - -static void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - -static int pblk_gc_read(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - spin_lock(&gc->r_lock); - if (list_empty(&gc->r_list)) { - spin_unlock(&gc->r_lock); - return 1; - } - - line = list_first_entry(&gc->r_list, struct pblk_line, list); - list_del(&line->list); - spin_unlock(&gc->r_lock); - - pblk_gc_kick(pblk); - - if (pblk_gc_line(pblk, line)) { - pblk_err(pblk, "failed to GC line %d\n", line->id); - /* rollback */ - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - } - - return 0; -} - -static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, - struct list_head *group_list) -{ - struct pblk_line *line, *victim; - unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L; - - victim = list_first_entry(group_list, struct pblk_line, list); - - list_for_each_entry(line, group_list, list) { - if 
(!atomic_read(&line->sec_to_update)) - line_vsc = le32_to_cpu(*line->vsc); - if (line_vsc < victim_vsc) { - victim = line; - victim_vsc = le32_to_cpu(*victim->vsc); - } - } - - if (victim_vsc == ~0x0) - return NULL; - - return victim; -} - -static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) -{ - unsigned int nr_blocks_free, nr_blocks_need; - unsigned int werr_lines = atomic_read(&rl->werr_lines); - - nr_blocks_need = pblk_rl_high_thrs(rl); - nr_blocks_free = pblk_rl_nr_free_blks(rl); - - /* This is not critical, no need to take lock here */ - return ((werr_lines > 0) || - ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); -} - -void pblk_gc_free_full_lines(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - do { - spin_lock(&l_mg->gc_lock); - if (list_empty(&l_mg->gc_full_list)) { - spin_unlock(&l_mg->gc_lock); - return; - } - - line = list_first_entry(&l_mg->gc_full_list, - struct pblk_line, list); - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - atomic_inc(&gc->pipeline_gc); - kref_put(&line->ref, pblk_line_put); - } while (1); -} - -/* - * Lines with no valid sectors will be returned to the free list immediately. If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. 
- */ -static void pblk_gc_run(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int read_inflight_gc, gc_group = 0, prev_group = 0; - - pblk_gc_free_full_lines(pblk); - - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) - return; - -next_gc_group: - group_list = l_mg->gc_lists[gc_group++]; - - do { - spin_lock(&l_mg->gc_lock); - - line = pblk_gc_get_victim_line(pblk, group_list); - if (!line) { - spin_unlock(&l_mg->gc_lock); - break; - } - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - - read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); - pblk_gc_reader_kick(gc); - - prev_group = 1; - - /* No need to queue up more GC lines than we can handle */ - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) - break; - } while (1); - - if (!prev_group && pblk->rl.rb_state > gc_group && - gc_group < PBLK_GC_NR_LISTS) - goto next_gc_group; -} - -static void pblk_gc_timer(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, gc.gc_timer); - - pblk_gc_kick(pblk); -} - -static int pblk_gc_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - pblk_gc_run(pblk); - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_writer_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - if (!pblk_gc_write(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - 
-static int pblk_gc_reader_ts(void *data) -{ - struct pblk *pblk = data; - struct pblk_gc *gc = &pblk->gc; - - while (!kthread_should_stop()) { - if (!pblk_gc_read(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "flushing gc pipeline, %d lines left\n", - atomic_read(&gc->pipeline_gc)); -#endif - - do { - if (!atomic_read(&gc->pipeline_gc)) - break; - - schedule(); - } while (1); - - return 0; -} - -static void pblk_gc_start(struct pblk *pblk) -{ - pblk->gc.gc_active = 1; - pblk_debug(pblk, "gc start\n"); -} - -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_enabled && !gc->gc_active) { - pblk_gc_start(pblk); - pblk_gc_kick(pblk); - } -} - -void pblk_gc_should_stop(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_active && !gc->gc_forced) - gc->gc_active = 0; -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - pblk_rl_update_rates(&pblk->rl); -} - -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - *gc_enabled = gc->gc_enabled; - *gc_active = gc->gc_active; - spin_unlock(&gc->lock); -} - -int pblk_gc_sysfs_force(struct pblk *pblk, int force) -{ - struct pblk_gc *gc = &pblk->gc; - - if (force < 0 || force > 1) - return -EINVAL; - - spin_lock(&gc->lock); - gc->gc_forced = force; - - if (force) - gc->gc_enabled = 1; - else - gc->gc_enabled = 0; - spin_unlock(&gc->lock); - - pblk_gc_should_start(pblk); - - return 0; -} - -int pblk_gc_init(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - int ret; - - gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); - if (IS_ERR(gc->gc_ts)) { - pblk_err(pblk, "could not allocate GC main kthread\n"); - return PTR_ERR(gc->gc_ts); - } - - gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, - "pblk-gc-writer-ts"); - if (IS_ERR(gc->gc_writer_ts)) { - pblk_err(pblk, 
"could not allocate GC writer kthread\n"); - ret = PTR_ERR(gc->gc_writer_ts); - goto fail_free_main_kthread; - } - - gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, - "pblk-gc-reader-ts"); - if (IS_ERR(gc->gc_reader_ts)) { - pblk_err(pblk, "could not allocate GC reader kthread\n"); - ret = PTR_ERR(gc->gc_reader_ts); - goto fail_free_writer_kthread; - } - - timer_setup(&gc->gc_timer, pblk_gc_timer, 0); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - - gc->gc_active = 0; - gc->gc_forced = 0; - gc->gc_enabled = 1; - gc->w_entries = 0; - atomic_set(&gc->read_inflight_gc, 0); - atomic_set(&gc->pipeline_gc, 0); - - /* Workqueue that reads valid sectors from a line and submit them to the - * GC writer to be recycled. - */ - gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); - if (!gc->gc_line_reader_wq) { - pblk_err(pblk, "could not allocate GC line reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_kthread; - } - - /* Workqueue that prepare lines for GC */ - gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!gc->gc_reader_wq) { - pblk_err(pblk, "could not allocate GC reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_line_wq; - } - - spin_lock_init(&gc->lock); - spin_lock_init(&gc->w_lock); - spin_lock_init(&gc->r_lock); - - sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); - - INIT_LIST_HEAD(&gc->w_list); - INIT_LIST_HEAD(&gc->r_list); - - return 0; - -fail_free_reader_line_wq: - destroy_workqueue(gc->gc_line_reader_wq); -fail_free_reader_kthread: - kthread_stop(gc->gc_reader_ts); -fail_free_writer_kthread: - kthread_stop(gc->gc_writer_ts); -fail_free_main_kthread: - kthread_stop(gc->gc_ts); - - return ret; -} - -void pblk_gc_exit(struct pblk *pblk, bool graceful) -{ - struct pblk_gc *gc = &pblk->gc; - - gc->gc_enabled = 0; - del_timer_sync(&gc->gc_timer); - gc->gc_active = 0; - - if (gc->gc_ts) - 
kthread_stop(gc->gc_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); - - if (graceful) { - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - } - - destroy_workqueue(gc->gc_reader_wq); - destroy_workqueue(gc->gc_line_reader_wq); - - if (gc->gc_writer_ts) - kthread_stop(gc->gc_writer_ts); -} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c deleted file mode 100644 index 5924f09c217b..000000000000 --- a/drivers/lightnvm/pblk-init.c +++ /dev/null @@ -1,1324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.c) - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-init.c - pblk's initialization. 
- */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned int write_buffer_size; - -module_param(write_buffer_size, uint, 0644); -MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); - -struct pblk_global_caches { - struct kmem_cache *ws; - struct kmem_cache *rec; - struct kmem_cache *g_rq; - struct kmem_cache *w_rq; - - struct kref kref; - - struct mutex mutex; /* Ensures consistency between - * caches and kref - */ -}; - -static struct pblk_global_caches pblk_caches = { - .mutex = __MUTEX_INITIALIZER(pblk_caches.mutex), - .kref = KREF_INIT(0), -}; - -struct bio_set pblk_bio_set; - -static blk_qc_t pblk_submit_bio(struct bio *bio) -{ - struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; - - if (bio_op(bio) == REQ_OP_DISCARD) { - pblk_discard(pblk, bio); - if (!(bio->bi_opf & REQ_PREFLUSH)) { - bio_endio(bio); - return BLK_QC_T_NONE; - } - } - - /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap - * constraint. Writes can be of arbitrary size. - */ - if (bio_data_dir(bio) == READ) { - blk_queue_split(&bio); - pblk_submit_read(pblk, bio); - } else { - /* Prevent deadlock in the case of a modest LUN configuration - * and large user I/Os. Unless stalled, the rate limiter - * leaves at least 256KB available for user I/O. 
- */ - if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) - blk_queue_split(&bio); - - pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); - } - - return BLK_QC_T_NONE; -} - -static const struct block_device_operations pblk_bops = { - .owner = THIS_MODULE, - .submit_bio = pblk_submit_bio, -}; - - -static size_t pblk_trans_map_size(struct pblk *pblk) -{ - int entry_size = 8; - - if (pblk->addrf_len < 32) - entry_size = 4; - - return entry_size * pblk->capacity; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static u32 pblk_l2p_crc(struct pblk *pblk) -{ - size_t map_size; - u32 crc = ~(u32)0; - - map_size = pblk_trans_map_size(pblk); - crc = crc32_le(crc, pblk->trans_map, map_size); - return crc; -} -#endif - -static void pblk_l2p_free(struct pblk *pblk) -{ - vfree(pblk->trans_map); -} - -static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) -{ - struct pblk_line *line = NULL; - - if (factory_init) { - guid_gen(&pblk->instance_uuid); - } else { - line = pblk_recov_l2p(pblk); - if (IS_ERR(line)) { - pblk_err(pblk, "could not recover l2p table\n"); - return -EFAULT; - } - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - /* Free full lines directly as GC has not been started yet */ - pblk_gc_free_full_lines(pblk); - - if (!line) { - /* Configure next line for user data */ - line = pblk_line_get_first_data(pblk); - if (!line) - return -EFAULT; - } - - return 0; -} - -static int pblk_l2p_init(struct pblk *pblk, bool factory_init) -{ - sector_t i; - struct ppa_addr ppa; - size_t map_size; - int ret = 0; - - map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | - __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); - if (!pblk->trans_map) { - pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", - map_size); - return -ENOMEM; - } - - pblk_ppa_set_empty(&ppa); - - for (i = 0; i < pblk->capacity; i++) - pblk_trans_map_set(pblk, i, ppa); - - ret = pblk_l2p_recover(pblk, 
factory_init); - if (ret) - vfree(pblk->trans_map); - - return ret; -} - -static void pblk_rwb_free(struct pblk *pblk) -{ - if (pblk_rb_tear_down_check(&pblk->rwb)) - pblk_err(pblk, "write buffer error on tear down\n"); - - pblk_rb_free(&pblk->rwb); -} - -static int pblk_rwb_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - unsigned long buffer_size; - int pgs_in_buffer, threshold; - - threshold = geo->mw_cunits * geo->all_luns; - pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt) - * geo->all_luns; - - if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) - buffer_size = write_buffer_size; - else - buffer_size = pgs_in_buffer; - - return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); -} - -static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, - struct nvm_addrf_12 *dst) -{ - struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; - int power_len; - - /* Re-calculate channel and lun format to adapt to configuration */ - power_len = get_count_order(geo->num_ch); - if (1 << power_len != geo->num_ch) { - pblk_err(pblk, "supports only power-of-two channel config.\n"); - return -EINVAL; - } - dst->ch_len = power_len; - - power_len = get_count_order(geo->num_lun); - if (1 << power_len != geo->num_lun) { - pblk_err(pblk, "supports only power-of-two LUN config.\n"); - return -EINVAL; - } - dst->lun_len = power_len; - - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->sec_offset = 0; - dst->pln_offset = dst->sec_len; - dst->ch_offset = dst->pln_offset + dst->pln_len; - dst->lun_offset = dst->ch_offset + dst->ch_len; - dst->pg_offset = dst->lun_offset + dst->lun_len; - dst->blk_offset = dst->pg_offset + dst->pg_len; - - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->ch_mask = ((1ULL << 
dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - - return dst->blk_offset + src->blk_len; -} - -static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst, - struct pblk_addrf *udst) -{ - struct nvm_addrf *src = &geo->addrf; - - adst->ch_len = get_count_order(geo->num_ch); - adst->lun_len = get_count_order(geo->num_lun); - adst->chk_len = src->chk_len; - adst->sec_len = src->sec_len; - - adst->sec_offset = 0; - adst->ch_offset = adst->sec_len; - adst->lun_offset = adst->ch_offset + adst->ch_len; - adst->chk_offset = adst->lun_offset + adst->lun_len; - - adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset; - adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset; - adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset; - adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset; - - udst->sec_stripe = geo->ws_opt; - udst->ch_stripe = geo->num_ch; - udst->lun_stripe = geo->num_lun; - - udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe; - udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe; - - return adst->chk_offset + adst->chk_len; -} - -static int pblk_set_addrf(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int mod; - - switch (geo->version) { - case NVM_OCSSD_SPEC_12: - div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); - if (mod) { - pblk_err(pblk, "bad configuration of sectors/pages\n"); - return -EINVAL; - } - - pblk->addrf_len = pblk_set_addrf_12(pblk, geo, - (void *)&pblk->addrf); - break; - case NVM_OCSSD_SPEC_20: - pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, - &pblk->uaddrf); - break; - default: - pblk_err(pblk, "OCSSD revision not supported (%d)\n", - geo->version); - return -EINVAL; - } - - return 0; -} - -static int 
pblk_create_global_caches(void) -{ - - pblk_caches.ws = kmem_cache_create("pblk_blk_ws", - sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_caches.ws) - return -ENOMEM; - - pblk_caches.rec = kmem_cache_create("pblk_rec", - sizeof(struct pblk_rec_ctx), 0, 0, NULL); - if (!pblk_caches.rec) - goto fail_destroy_ws; - - pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, - 0, 0, NULL); - if (!pblk_caches.g_rq) - goto fail_destroy_rec; - - pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, - 0, 0, NULL); - if (!pblk_caches.w_rq) - goto fail_destroy_g_rq; - - return 0; - -fail_destroy_g_rq: - kmem_cache_destroy(pblk_caches.g_rq); -fail_destroy_rec: - kmem_cache_destroy(pblk_caches.rec); -fail_destroy_ws: - kmem_cache_destroy(pblk_caches.ws); - - return -ENOMEM; -} - -static int pblk_get_global_caches(void) -{ - int ret = 0; - - mutex_lock(&pblk_caches.mutex); - - if (kref_get_unless_zero(&pblk_caches.kref)) - goto out; - - ret = pblk_create_global_caches(); - if (!ret) - kref_init(&pblk_caches.kref); - -out: - mutex_unlock(&pblk_caches.mutex); - return ret; -} - -static void pblk_destroy_global_caches(struct kref *ref) -{ - struct pblk_global_caches *c; - - c = container_of(ref, struct pblk_global_caches, kref); - - kmem_cache_destroy(c->ws); - kmem_cache_destroy(c->rec); - kmem_cache_destroy(c->g_rq); - kmem_cache_destroy(c->w_rq); -} - -static void pblk_put_global_caches(void) -{ - mutex_lock(&pblk_caches.mutex); - kref_put(&pblk_caches.kref, pblk_destroy_global_caches); - mutex_unlock(&pblk_caches.mutex); -} - -static int pblk_core_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ret, max_write_ppas; - - atomic64_set(&pblk->user_wa, 0); - atomic64_set(&pblk->pad_wa, 0); - atomic64_set(&pblk->gc_wa, 0); - pblk->user_rst_wa = 0; - pblk->pad_rst_wa = 0; - pblk->gc_rst_wa = 0; - - atomic64_set(&pblk->nr_flush, 0); - pblk->nr_flush_rst = 0; - - pblk->min_write_pgs = geo->ws_opt; - 
pblk->min_write_pgs_data = pblk->min_write_pgs; - max_write_ppas = pblk->min_write_pgs * geo->all_luns; - pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); - pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, - queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); - pblk_set_sec_per_write(pblk, pblk->min_write_pgs); - - pblk->oob_meta_size = geo->sos; - if (!pblk_is_oob_meta_supported(pblk)) { - /* For drives which does not have OOB metadata feature - * in order to support recovery feature we need to use - * so called packed metadata. Packed metada will store - * the same information as OOB metadata (l2p table mapping, - * but in the form of the single page at the end of - * every write request. - */ - if (pblk->min_write_pgs - * sizeof(struct pblk_sec_meta) > PAGE_SIZE) { - /* We want to keep all the packed metadata on single - * page per write requests. So we need to ensure that - * it will fit. - * - * This is more like sanity check, since there is - * no device with such a big minimal write size - * (above 1 metabytes). - */ - pblk_err(pblk, "Not supported min write size\n"); - return -EINVAL; - } - /* For packed meta approach we do some simplification. - * On read path we always issue requests which size - * equal to max_write_pgs, with all pages filled with - * user payload except of last one page which will be - * filled with packed metadata. - */ - pblk->max_write_pgs = pblk->min_write_pgs; - pblk->min_write_pgs_data = pblk->min_write_pgs - 1; - } - - pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), - GFP_KERNEL); - if (!pblk->pad_dist) - return -ENOMEM; - - if (pblk_get_global_caches()) - goto fail_free_pad_dist; - - /* Internal bios can be at most the sectors signaled by the device. 
*/ - ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0); - if (ret) - goto free_global_caches; - - ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE, - pblk_caches.ws); - if (ret) - goto free_page_bio_pool; - - ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns, - pblk_caches.rec); - if (ret) - goto free_gen_ws_pool; - - ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_rec_pool; - - ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_r_rq_pool; - - ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns, - pblk_caches.w_rq); - if (ret) - goto free_e_rq_pool; - - pblk->close_wq = alloc_workqueue("pblk-close-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); - if (!pblk->close_wq) - goto free_w_rq_pool; - - pblk->bb_wq = alloc_workqueue("pblk-bb-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->bb_wq) - goto free_close_wq; - - pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->r_end_wq) - goto free_bb_wq; - - if (pblk_set_addrf(pblk)) - goto free_r_end_wq; - - INIT_LIST_HEAD(&pblk->compl_list); - INIT_LIST_HEAD(&pblk->resubmit_list); - - return 0; - -free_r_end_wq: - destroy_workqueue(pblk->r_end_wq); -free_bb_wq: - destroy_workqueue(pblk->bb_wq); -free_close_wq: - destroy_workqueue(pblk->close_wq); -free_w_rq_pool: - mempool_exit(&pblk->w_rq_pool); -free_e_rq_pool: - mempool_exit(&pblk->e_rq_pool); -free_r_rq_pool: - mempool_exit(&pblk->r_rq_pool); -free_rec_pool: - mempool_exit(&pblk->rec_pool); -free_gen_ws_pool: - mempool_exit(&pblk->gen_ws_pool); -free_page_bio_pool: - mempool_exit(&pblk->page_bio_pool); -free_global_caches: - pblk_put_global_caches(); -fail_free_pad_dist: - kfree(pblk->pad_dist); - return -ENOMEM; -} - -static void pblk_core_free(struct pblk *pblk) -{ - if (pblk->close_wq) - destroy_workqueue(pblk->close_wq); - - if 
(pblk->r_end_wq) - destroy_workqueue(pblk->r_end_wq); - - if (pblk->bb_wq) - destroy_workqueue(pblk->bb_wq); - - mempool_exit(&pblk->page_bio_pool); - mempool_exit(&pblk->gen_ws_pool); - mempool_exit(&pblk->rec_pool); - mempool_exit(&pblk->r_rq_pool); - mempool_exit(&pblk->e_rq_pool); - mempool_exit(&pblk->w_rq_pool); - - pblk_put_global_caches(); - kfree(pblk->pad_dist); -} - -static void pblk_line_mg_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - kfree(l_mg->bb_template); - kfree(l_mg->bb_aux); - kfree(l_mg->vsc_list); - - for (i = 0; i < PBLK_DATA_LINES; i++) { - kfree(l_mg->sline_meta[i]); - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); - kmem_cache_destroy(l_mg->bitmap_cache); -} - -static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, - struct pblk_line *line) -{ - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); - kfree(line->chks); - - kvfree(w_err_gc->lba_list); - kfree(w_err_gc); -} - -static void pblk_lines_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - int i; - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - pblk_line_free(line); - pblk_line_meta_free(l_mg, line); - } - - pblk_line_mg_free(pblk); - - kfree(pblk->luns); - kfree(pblk->lines); -} - -static int pblk_luns_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int i; - - /* TODO: Implement unbalanced LUN support */ - if (geo->num_lun < 0) { - pblk_err(pblk, "unbalanced LUN config.\n"); - return -EINVAL; - } - - pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), - GFP_KERNEL); - if (!pblk->luns) - return -ENOMEM; - - for (i = 0; i < geo->all_luns; i++) { - /* Stripe across channels */ - int ch = i % geo->num_ch; - int lun_raw = i / geo->num_ch; - int lunid = lun_raw + ch * 
geo->num_lun; - - rlun = &pblk->luns[i]; - rlun->bppa = dev->luns[lunid]; - - sema_init(&rlun->wr_sem, 1); - } - - return 0; -} - -/* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - /* Round to sector size so that lba_list starts on its own sector */ - lm->emeta_sec[1] = DIV_ROUND_UP( - sizeof(struct line_emeta) + lm->blk_bitmap_len + - sizeof(struct wa_counters), geo->csecs); - lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs; - - /* Round to sector size so that vsc_list starts on its own sector */ - lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; - lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), - geo->csecs); - lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs; - - lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), - geo->csecs); - lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs; - - lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); - - return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); -} - -static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - sector_t provisioned; - int sec_meta, blk_meta, clba; - int minimum; - - if (geo->op == NVM_TARGET_DEFAULT_OP) - pblk->op = PBLK_DEFAULT_OP; - else - pblk->op = geo->op; - - minimum = pblk_get_min_chks(pblk); - provisioned = nr_free_chks; - provisioned *= (100 - pblk->op); - sector_div(provisioned, 100); - - if ((nr_free_chks - provisioned) < minimum) { - if (geo->op != NVM_TARGET_DEFAULT_OP) { - pblk_err(pblk, "OP too small to create a sane instance\n"); - return -EINTR; - } - - /* If the user did not specify an OP value, and PBLK_DEFAULT_OP - * is not enough, calculate and set 
sane value - */ - - provisioned = nr_free_chks - minimum; - pblk->op = (100 * minimum) / nr_free_chks; - pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", - pblk->op); - } - - pblk->op_blks = nr_free_chks - provisioned; - - /* Internally pblk manages all free blocks, but all calculations based - * on user capacity consider only provisioned blocks - */ - pblk->rl.total_blocks = nr_free_chks; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; - pblk->capacity = (provisioned - blk_meta) * clba; - - atomic_set(&pblk->rl.free_blocks, nr_free_chks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); - - return 0; -} - -static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, - struct nvm_chk_meta *meta) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int i, nr_bad_chks = 0; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - struct nvm_chk_meta *chunk; - struct nvm_chk_meta *chunk_meta; - struct ppa_addr ppa; - int pos; - - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; - - ppa.m.chk = line->id; - chunk_meta = pblk_chunk_get_off(pblk, meta, ppa); - - chunk->state = chunk_meta->state; - chunk->type = chunk_meta->type; - chunk->wi = chunk_meta->wi; - chunk->slba = chunk_meta->slba; - chunk->cnlb = chunk_meta->cnlb; - chunk->wp = chunk_meta->wp; - - trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa, - chunk->state); - - if (chunk->type & NVM_CHK_TP_SZ_SPEC) { - WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); - continue; - } - - if (!(chunk->state & NVM_CHK_ST_OFFLINE)) - continue; - - set_bit(pos, line->blk_bitmap); - nr_bad_chks++; - } - - return nr_bad_chks; -} - -static long 
pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line, - void *chunk_meta, int line_id) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - long nr_bad_chks, chk_in_line; - - line->pblk = pblk; - line->id = line_id; - line->type = PBLK_LINETYPE_FREE; - line->state = PBLK_LINESTATE_NEW; - line->gc_group = PBLK_LINEGC_NONE; - line->vsc = &l_mg->vsc_list[line_id]; - spin_lock_init(&line->lock); - - nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta); - - chk_in_line = lm->blk_per_line - nr_bad_chks; - if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line || - chk_in_line < lm->min_blk_line) { - line->state = PBLK_LINESTATE_BAD; - list_add_tail(&line->list, &l_mg->bad_list); - return 0; - } - - atomic_set(&line->blk_in_line, chk_in_line); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - - return chk_in_line; -} - -static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - - line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->blk_bitmap) - return -ENOMEM; - - line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->erase_bitmap) - goto free_blk_bitmap; - - - line->chks = kmalloc_array(lm->blk_per_line, - sizeof(struct nvm_chk_meta), GFP_KERNEL); - if (!line->chks) - goto free_erase_bitmap; - - line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); - if (!line->w_err_gc) - goto free_chks; - - return 0; - -free_chks: - kfree(line->chks); -free_erase_bitmap: - kfree(line->erase_bitmap); -free_blk_bitmap: - kfree(line->blk_bitmap); - return -ENOMEM; -} - -static int pblk_line_mg_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int i, bb_distance; - - l_mg->nr_lines = geo->num_chk; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = 
l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - - INIT_LIST_HEAD(&l_mg->free_list); - INIT_LIST_HEAD(&l_mg->corrupt_list); - INIT_LIST_HEAD(&l_mg->bad_list); - INIT_LIST_HEAD(&l_mg->gc_full_list); - INIT_LIST_HEAD(&l_mg->gc_high_list); - INIT_LIST_HEAD(&l_mg->gc_mid_list); - INIT_LIST_HEAD(&l_mg->gc_low_list); - INIT_LIST_HEAD(&l_mg->gc_empty_list); - INIT_LIST_HEAD(&l_mg->gc_werr_list); - - INIT_LIST_HEAD(&l_mg->emeta_list); - - l_mg->gc_lists[0] = &l_mg->gc_werr_list; - l_mg->gc_lists[1] = &l_mg->gc_high_list; - l_mg->gc_lists[2] = &l_mg->gc_mid_list; - l_mg->gc_lists[3] = &l_mg->gc_low_list; - - spin_lock_init(&l_mg->free_lock); - spin_lock_init(&l_mg->close_lock); - spin_lock_init(&l_mg->gc_lock); - - l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); - if (!l_mg->vsc_list) - goto fail; - - l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_template) - goto fail_free_vsc_list; - - l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_aux) - goto fail_free_bb_template; - - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i]) - goto fail_free_smeta; - } - - l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap", - lm->sec_bitmap_len, 0, 0, NULL); - if (!l_mg->bitmap_cache) - goto fail_free_smeta; - - /* the bitmap pool is used for both valid and map bitmaps */ - l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2, - l_mg->bitmap_cache); - if (!l_mg->bitmap_pool) - goto fail_destroy_bitmap_cache; - - /* emeta allocates three different buffers for managing metadata with - * in-memory and in-media layouts - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - struct pblk_emeta *emeta; - - emeta = kmalloc(sizeof(struct 
pblk_emeta), GFP_KERNEL); - if (!emeta) - goto fail_free_emeta; - - emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; - } - - for (i = 0; i < l_mg->nr_lines; i++) - l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); - - bb_distance = (geo->all_luns) * geo->ws_opt; - for (i = 0; i < lm->sec_per_line; i += bb_distance) - bitmap_set(l_mg->bb_template, i, geo->ws_opt); - - return 0; - -fail_free_emeta: - while (--i >= 0) { - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); -fail_destroy_bitmap_cache: - kmem_cache_destroy(l_mg->bitmap_cache); -fail_free_smeta: - for (i = 0; i < PBLK_DATA_LINES; i++) - kfree(l_mg->sline_meta[i]); - kfree(l_mg->bb_aux); -fail_free_bb_template: - kfree(l_mg->bb_template); -fail_free_vsc_list: - kfree(l_mg->vsc_list); -fail: - return -ENOMEM; -} - -static int pblk_line_meta_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int smeta_len, emeta_len; - int i; - - lm->sec_per_line = geo->clba * geo->all_luns; - lm->blk_per_line = geo->all_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->mid_thrs = lm->sec_per_line / 2; - lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; - - /* Calculate necessary pages for smeta. 
See comment over struct - * line_smeta definition - */ - i = 1; -add_smeta_page: - lm->smeta_sec = i * geo->ws_opt; - lm->smeta_len = lm->smeta_sec * geo->csecs; - - smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; - if (smeta_len > lm->smeta_len) { - i++; - goto add_smeta_page; - } - - /* Calculate necessary pages for emeta. See comment over struct - * line_emeta definition - */ - i = 1; -add_emeta_page: - lm->emeta_sec[0] = i * geo->ws_opt; - lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs; - - emeta_len = calc_emeta_len(pblk); - if (emeta_len > lm->emeta_len[0]) { - i++; - goto add_emeta_page; - } - - lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; - - lm->min_blk_line = 1; - if (geo->all_luns > 1) - lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->clba); - - if (lm->min_blk_line > lm->blk_per_line) { - pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", - lm->blk_per_line); - return -EINVAL; - } - - return 0; -} - -static int pblk_lines_init(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - void *chunk_meta; - int nr_free_chks = 0; - int i, ret; - - ret = pblk_line_meta_init(pblk); - if (ret) - return ret; - - ret = pblk_line_mg_init(pblk); - if (ret) - return ret; - - ret = pblk_luns_init(pblk); - if (ret) - goto fail_free_meta; - - chunk_meta = pblk_get_chunk_meta(pblk); - if (IS_ERR(chunk_meta)) { - ret = PTR_ERR(chunk_meta); - goto fail_free_luns; - } - - pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), - GFP_KERNEL); - if (!pblk->lines) { - ret = -ENOMEM; - goto fail_free_chunk_meta; - } - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - ret = pblk_alloc_line_meta(pblk, line); - if (ret) - goto fail_free_lines; - - nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } - - if (!nr_free_chks) { - pblk_err(pblk, "too many bad blocks 
prevent for sane instance\n"); - ret = -EINTR; - goto fail_free_lines; - } - - ret = pblk_set_provision(pblk, nr_free_chks); - if (ret) - goto fail_free_lines; - - vfree(chunk_meta); - return 0; - -fail_free_lines: - while (--i >= 0) - pblk_line_meta_free(l_mg, &pblk->lines[i]); - kfree(pblk->lines); -fail_free_chunk_meta: - vfree(chunk_meta); -fail_free_luns: - kfree(pblk->luns); -fail_free_meta: - pblk_line_mg_free(pblk); - - return ret; -} - -static int pblk_writer_init(struct pblk *pblk) -{ - pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); - if (IS_ERR(pblk->writer_ts)) { - int err = PTR_ERR(pblk->writer_ts); - - if (err != -EINTR) - pblk_err(pblk, "could not allocate writer kthread (%d)\n", - err); - return err; - } - - timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); - - return 0; -} - -static void pblk_writer_stop(struct pblk *pblk) -{ - /* The pipeline must be stopped and the write buffer emptied before the - * write thread is stopped - */ - WARN(pblk_rb_read_count(&pblk->rwb), - "Stopping not fully persisted write buffer\n"); - - WARN(pblk_rb_sync_count(&pblk->rwb), - "Stopping not fully synced write buffer\n"); - - del_timer_sync(&pblk->wtimer); - if (pblk->writer_ts) - kthread_stop(pblk->writer_ts); -} - -static void pblk_free(struct pblk *pblk) -{ - pblk_lines_free(pblk); - pblk_l2p_free(pblk); - pblk_rwb_free(pblk); - pblk_core_free(pblk); - - kfree(pblk); -} - -static void pblk_tear_down(struct pblk *pblk, bool graceful) -{ - if (graceful) - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); - pblk_writer_stop(pblk); - pblk_rb_sync_l2p(&pblk->rwb); - pblk_rl_free(&pblk->rl); - - pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); -} - -static void pblk_exit(void *private, bool graceful) -{ - struct pblk *pblk = private; - - pblk_gc_exit(pblk, graceful); - pblk_tear_down(pblk, graceful); - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, 
"exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - pblk_free(pblk); -} - -static sector_t pblk_capacity(void *private) -{ - struct pblk *pblk = private; - - return pblk->capacity * NR_PHY_IN_LOG; -} - -static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *bqueue = dev->q; - struct request_queue *tqueue = tdisk->queue; - struct pblk *pblk; - int ret; - - pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); - if (!pblk) - return ERR_PTR(-ENOMEM); - - pblk->dev = dev; - pblk->disk = tdisk; - pblk->state = PBLK_STATE_RUNNING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - pblk->gc.gc_enabled = 0; - - if (!(geo->version == NVM_OCSSD_SPEC_12 || - geo->version == NVM_OCSSD_SPEC_20)) { - pblk_err(pblk, "OCSSD version not supported (%u)\n", - geo->version); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - if (geo->ext) { - pblk_err(pblk, "extended metadata not supported\n"); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - spin_lock_init(&pblk->resubmit_lock); - spin_lock_init(&pblk->trans_lock); - spin_lock_init(&pblk->lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_set(&pblk->inflight_writes, 0); - atomic_long_set(&pblk->padded_writes, 0); - atomic_long_set(&pblk->padded_wb, 0); - atomic_long_set(&pblk->req_writes, 0); - atomic_long_set(&pblk->sub_writes, 0); - atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->inflight_reads, 0); - atomic_long_set(&pblk->cache_reads, 0); - atomic_long_set(&pblk->sync_reads, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_gc_writes, 0); - atomic_long_set(&pblk->recov_gc_reads, 0); -#endif - - atomic_long_set(&pblk->read_failed, 0); - atomic_long_set(&pblk->read_empty, 0); - atomic_long_set(&pblk->read_high_ecc, 0); - atomic_long_set(&pblk->read_failed_gc, 0); - atomic_long_set(&pblk->write_failed, 0); - atomic_long_set(&pblk->erase_failed, 0); - 
- ret = pblk_core_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize core\n"); - goto fail; - } - - ret = pblk_lines_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize lines\n"); - goto fail_free_core; - } - - ret = pblk_rwb_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize write buffer\n"); - goto fail_free_lines; - } - - ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); - if (ret) { - pblk_err(pblk, "could not initialize maps\n"); - goto fail_free_rwb; - } - - ret = pblk_writer_init(pblk); - if (ret) { - if (ret != -EINTR) - pblk_err(pblk, "could not initialize write thread\n"); - goto fail_free_l2p; - } - - ret = pblk_gc_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize gc\n"); - goto fail_stop_writer; - } - - /* inherit the size from the underlying device */ - blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); - blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); - - blk_queue_write_cache(tqueue, true, false); - - tqueue->limits.discard_granularity = geo->clba * geo->csecs; - tqueue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); - - pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->all_luns, pblk->l_mg.nr_lines, - (unsigned long long)pblk->capacity, - pblk->rwb.nr_entries); - - wake_up_process(pblk->writer_ts); - - /* Check if we need to start GC */ - pblk_gc_should_kick(pblk); - - return pblk; - -fail_stop_writer: - pblk_writer_stop(pblk); -fail_free_l2p: - pblk_l2p_free(pblk); -fail_free_rwb: - pblk_rwb_free(pblk); -fail_free_lines: - pblk_lines_free(pblk); -fail_free_core: - pblk_core_free(pblk); -fail: - kfree(pblk); - return ERR_PTR(ret); -} - -/* physical block device target */ -static struct nvm_tgt_type tt_pblk = { - .name = "pblk", - .version = {1, 0, 0}, - - .bops = &pblk_bops, - .capacity = pblk_capacity, - - .init = pblk_init, - .exit = pblk_exit, 
- - .sysfs_init = pblk_sysfs_init, - .sysfs_exit = pblk_sysfs_exit, - .owner = THIS_MODULE, -}; - -static int __init pblk_module_init(void) -{ - int ret; - - ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) - return ret; - ret = nvm_register_tgt_type(&tt_pblk); - if (ret) - bioset_exit(&pblk_bio_set); - return ret; -} - -static void pblk_module_exit(void) -{ - bioset_exit(&pblk_bio_set); - nvm_unregister_tgt_type(&tt_pblk); -} - -module_init(pblk_module_init); -module_exit(pblk_module_exit); -MODULE_AUTHOR("Javier Gonzalez "); -MODULE_AUTHOR("Matias Bjorling "); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c deleted file mode 100644 index 5408e32b2f13..000000000000 --- a/drivers/lightnvm/pblk-map.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-map.c - pblk's lba-ppa mapping strategy - * - */ - -#include "pblk.h" - -static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, - struct ppa_addr *ppa_list, - unsigned long *lun_bitmap, - void *meta_list, - unsigned int valid_secs) -{ - struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta; - struct pblk_w_ctx *w_ctx; - __le64 *lba_list; - u64 paddr; - int nr_secs = pblk->min_write_pgs; - int i; - - if (!line) - return -ENOSPC; - - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - /* If we cannot allocate a new line, make sure to store metadata - * on current line and then fail - */ - line = pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - - if (!line) { - pblk_pipeline_stop(pblk); - return -ENOSPC; - } - - } - - emeta = line->emeta; - lba_list = emeta_to_lbas(pblk, emeta->buf); - - paddr = pblk_alloc_page(pblk, line, nr_secs); - - for (i = 0; i < nr_secs; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - /* ppa to be sent to the device */ - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - /* Write context for target bio completion on write buffer. Note - * that the write buffer is protected by the sync backpointer, - * and a single writer thread have access to each specific entry - * at a time. Thus, it is safe to modify the context for the - * entry we are setting up for submission without taking any - * lock or memory barrier. 
- */ - if (i < valid_secs) { - kref_get(&line->ref); - atomic_inc(&line->sec_to_update); - w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); - w_ctx->ppa = ppa_list[i]; - meta->lba = cpu_to_le64(w_ctx->lba); - lba_list[paddr] = cpu_to_le64(w_ctx->lba); - if (lba_list[paddr] != addr_empty) - line->nr_valid_lbas++; - else - atomic64_inc(&pblk->pad_wa); - } else { - lba_list[paddr] = addr_empty; - meta->lba = addr_empty; - __pblk_map_invalidate(pblk, line, paddr); - } - } - - pblk_down_rq(pblk, ppa_list[0], lun_bitmap); - return 0; -} - -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off) -{ - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i; - int ret; - - for (i = off; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - } - - return 0; -} - -/* only if erase_ppa is set, acquire erase semaphore */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_line *e_line, *d_line; - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i, erase_lun; - int ret; - - - for (i = 0; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? 
(valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - - erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - - spin_lock(&e_line->lock); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - set_bit(erase_lun, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - - *erase_ppa = ppa_list[i]; - erase_ppa->a.blk = e_line->id; - erase_ppa->a.reserved = 0; - - spin_unlock(&e_line->lock); - - /* Avoid evaluating e_line->left_eblks */ - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - } - spin_unlock(&e_line->lock); - } - - d_line = pblk_line_get_data(pblk); - - /* line can change after page map. We might also be writing the - * last line. 
- */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return -ENOSPC; - - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(pblk_ppa_empty(*erase_ppa)) && - bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { - int bit = -1; - -retry: - bit = find_next_bit(d_line->blk_bitmap, - lm->blk_per_line, bit + 1); - if (bit >= lm->blk_per_line) - return 0; - - spin_lock(&e_line->lock); - if (test_bit(bit, e_line->erase_bitmap)) { - spin_unlock(&e_line->lock); - goto retry; - } - spin_unlock(&e_line->lock); - - set_bit(bit, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ - erase_ppa->a.blk = e_line->id; - } - - return 0; -} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c deleted file mode 100644 index 5abb1705b039..000000000000 --- a/drivers/lightnvm/pblk-rb.c +++ /dev/null @@ -1,858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * - * Based upon the circular ringbuffer. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-rb.c - pblk's write buffer - */ - -#include - -#include "pblk.h" - -static DECLARE_RWSEM(pblk_rb_lock); - -static void pblk_rb_data_free(struct pblk_rb *rb) -{ - struct pblk_rb_pages *p, *t; - - down_write(&pblk_rb_lock); - list_for_each_entry_safe(p, t, &rb->pages, list) { - free_pages((unsigned long)page_address(p->pages), p->order); - list_del(&p->list); - kfree(p); - } - up_write(&pblk_rb_lock); -} - -void pblk_rb_free(struct pblk_rb *rb) -{ - pblk_rb_data_free(rb); - vfree(rb->entries); -} - -/* - * pblk_rb_calculate_size -- calculate the size of the write buffer - */ -static unsigned int pblk_rb_calculate_size(unsigned int nr_entries, - unsigned int threshold) -{ - unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA)); - unsigned int max_sz = max(thr_sz, nr_entries); - unsigned int max_io; - - /* Alloc a write buffer that can (i) fit at least two split bios - * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the - * threshold will be respected - */ - max_io = (1 << max((int)(get_count_order(max_sz)), - (int)(get_count_order(NVM_MAX_VLBA << 1)))); - if ((threshold + NVM_MAX_VLBA) >= max_io) - max_io <<= 1; - - return max_io; -} - -/* - * Initialize ring buffer. 
The data and metadata buffers must be previously - * allocated and their size must be a power of two - * (Documentation/core-api/circular-buffers.rst) - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_size) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entries; - unsigned int init_entry = 0; - unsigned int max_order = MAX_ORDER - 1; - unsigned int power_size, power_seg_sz; - unsigned int alloc_order, order, iter; - unsigned int nr_entries; - - nr_entries = pblk_rb_calculate_size(size, threshold); - entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); - if (!entries) - return -ENOMEM; - - power_size = get_count_order(nr_entries); - power_seg_sz = get_count_order(seg_size); - - down_write(&pblk_rb_lock); - rb->entries = entries; - rb->seg_size = (1 << power_seg_sz); - rb->nr_entries = (1 << power_size); - rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; - rb->back_thres = threshold; - rb->flush_point = EMPTY_ENTRY; - - spin_lock_init(&rb->w_lock); - spin_lock_init(&rb->s_lock); - - INIT_LIST_HEAD(&rb->pages); - - alloc_order = power_size; - if (alloc_order >= max_order) { - order = max_order; - iter = (1 << (alloc_order - max_order)); - } else { - order = alloc_order; - iter = 1; - } - - do { - struct pblk_rb_entry *entry; - struct pblk_rb_pages *page_set; - void *kaddr; - unsigned long set_size; - int i; - - page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); - if (!page_set) { - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - - page_set->order = order; - page_set->pages = alloc_pages(GFP_KERNEL, order); - if (!page_set->pages) { - kfree(page_set); - pblk_rb_data_free(rb); - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - kaddr = page_address(page_set->pages); - - entry = &rb->entries[init_entry]; - entry->data = kaddr; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->w_ctx.flags = 
PBLK_WRITABLE_ENTRY; - - set_size = (1 << order); - for (i = 1; i < set_size; i++) { - entry = &rb->entries[init_entry]; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->data = kaddr + (i * rb->seg_size); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - bio_list_init(&entry->w_ctx.bios); - } - - list_add_tail(&page_set->list, &rb->pages); - iter--; - } while (iter > 0); - up_write(&pblk_rb_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_set(&rb->inflight_flush_point, 0); -#endif - - /* - * Initialize rate-limiter, which controls access to the write buffer - * by user and GC I/O - */ - pblk_rl_init(&pblk->rl, rb->nr_entries, threshold); - - return 0; -} - -static void clean_wctx(struct pblk_w_ctx *w_ctx) -{ - int flags; - - flags = READ_ONCE(w_ctx->flags); - WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), - "pblk: overwriting unsubmitted data\n"); - - /* Release flags on context. Protect from writes and reads */ - smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); - pblk_ppa_set_empty(&w_ctx->ppa); - w_ctx->lba = ADDR_EMPTY; -} - -#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) -#define pblk_rb_ring_space(rb, head, tail, size) \ - (CIRC_SPACE(head, tail, size)) - -/* - * Buffer space is calculated with respect to the back pointer signaling - * synchronized entries to the media. 
- */ -static unsigned int pblk_rb_space(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries) -{ - return (p + nr_entries) & (rb->nr_entries - 1); -} - -/* - * Buffer count is calculated with respect to the submission entry signaling the - * entries that are available to send to the media - */ -unsigned int pblk_rb_read_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int subm = READ_ONCE(rb->subm); - - return pblk_rb_ring_count(mem, subm, rb->nr_entries); -} - -unsigned int pblk_rb_sync_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_count(mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int subm; - - subm = READ_ONCE(rb->subm); - /* Commit read means updating submission pointer */ - smp_store_release(&rb->subm, pblk_rb_ptr_wrap(rb, subm, nr_entries)); - - return subm; -} - -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_line *line; - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - unsigned int user_io = 0, gc_io = 0; - unsigned int i; - int flags; - - for (i = 0; i < to_update; i++) { - entry = &rb->entries[rb->l2p_update]; - w_ctx = &entry->w_ctx; - - flags = READ_ONCE(entry->w_ctx.flags); - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); - - pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, - entry->cacheline); - - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - clean_wctx(w_ctx); - 
rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); - } - - pblk_rl_out(&pblk->rl, user_io, gc_io); - - return 0; -} - -/* - * When we move the l2p_update pointer, we update the l2p table - lookups will - * point to the physical address instead of to the cacheline in the write buffer - * from this moment on. - */ -static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int mem, unsigned int sync) -{ - unsigned int space, count; - int ret = 0; - - lockdep_assert_held(&rb->w_lock); - - /* Update l2p only as buffer entries are being overwritten */ - space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); - if (space > nr_entries) - goto out; - - count = nr_entries - space; - /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, count); - -out: - return ret; -} - -/* - * Update the l2p entry for all sectors stored on the write buffer. This means - * that all future lookups to the l2p table will point to a device address, not - * to the cacheline in the write buffer. - */ -void pblk_rb_sync_l2p(struct pblk_rb *rb) -{ - unsigned int sync; - unsigned int to_update; - - spin_lock(&rb->w_lock); - - /* Protect from reads and writes */ - sync = smp_load_acquire(&rb->sync); - - to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, to_update); - - spin_unlock(&rb->w_lock); -} - -/* - * Write @nr_entries to ring buffer from @data buffer if there is enough space. - * Typically, 4KB data chunks coming from a bio will be copied to the ring - * buffer, thus the write will fail if not all incoming data can be copied. 
- * - */ -static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, - struct pblk_rb_entry *entry) -{ - memcpy(entry->data, data, rb->seg_size); - - entry->w_ctx.lba = w_ctx.lba; - entry->w_ctx.ppa = w_ctx.ppa; -} - -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) - entry->w_ctx.lba = ADDR_EMPTY; - - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. 
Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, - unsigned int pos) -{ - struct pblk_rb_entry *entry; - unsigned int sync, flush_point; - - pblk_rb_sync_init(rb, NULL); - sync = READ_ONCE(rb->sync); - - if (pos == sync) { - pblk_rb_sync_end(rb, NULL); - return 0; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_inc(&rb->inflight_flush_point); -#endif - - flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); - entry = &rb->entries[flush_point]; - - /* Protect flush points */ - smp_store_release(&rb->flush_point, flush_point); - - if (bio) - bio_list_add(&entry->w_ctx.bios, bio); - - pblk_rb_sync_end(rb, NULL); - - return bio ? 1 : 0; -} - -static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - unsigned int mem; - unsigned int sync; - unsigned int threshold; - - sync = READ_ONCE(rb->sync); - mem = READ_ONCE(rb->mem); - - threshold = nr_entries + rb->back_thres; - - if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < threshold) - return 0; - - if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) - return 0; - - *pos = mem; - - return 1; -} - -static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - /* Protect from read count */ - smp_store_release(&rb->mem, pblk_rb_ptr_wrap(rb, *pos, nr_entries)); - return 1; -} - -void pblk_rb_flush(struct pblk_rb *rb) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - unsigned int mem = READ_ONCE(rb->mem); - - if (pblk_rb_flush_point_set(rb, NULL, mem)) - return; - - pblk_write_kick(pblk); -} - -static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos, struct bio *bio, - int *io_ret) -{ - unsigned int mem; - - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - mem = pblk_rb_ptr_wrap(rb, *pos, nr_entries); - 
*io_ret = NVM_IO_DONE; - - if (bio->bi_opf & REQ_PREFLUSH) { - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - atomic64_inc(&pblk->nr_flush); - if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) - *io_ret = NVM_IO_OK; - } - - /* Protect from read count */ - smp_store_release(&rb->mem, mem); - - return 1; -} - -/* - * Atomically check that (i) there is space on the write buffer for the - * incoming I/O, and (ii) the current I/O type has enough budget in the write - * buffer (rate-limiter). - */ -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - int io_ret; - - spin_lock(&rb->w_lock); - io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); - if (io_ret) { - spin_unlock(&rb->w_lock); - return io_ret; - } - - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { - spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; - } - - pblk_rl_user_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return io_ret; -} - -/* - * Look at pblk_rb_may_write_user comment - */ -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - spin_lock(&rb->w_lock); - if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { - spin_unlock(&rb->w_lock); - return 0; - } - - if (!pblk_rb_may_write(rb, nr_entries, pos)) { - spin_unlock(&rb->w_lock); - return 0; - } - - pblk_rl_gc_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return 1; -} - -/* - * Read available entries on rb and add them to the given bio. To avoid a memory - * copy, a page reference to the write buffer is used to be added to the bio. - * - * This function is used by the write thread to form the write bio that will - * persist data on the write buffer to the media. 
- */ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct request_queue *q = pblk->dev->q; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - struct pblk_rb_entry *entry; - struct page *page; - unsigned int pad = 0, to_read = nr_entries; - unsigned int i; - int flags; - - if (count < nr_entries) { - pad = nr_entries - count; - to_read = count; - } - - /* Add space for packed metadata if in use*/ - pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); - - c_ctx->sentry = pos; - c_ctx->nr_valid = to_read; - c_ctx->nr_padded = pad; - - for (i = 0; i < to_read; i++) { - entry = &rb->entries[pos]; - - /* A write has been allowed into the buffer, but data is still - * being copied to it. It is ok to busy wait. - */ -try: - flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) { - io_schedule(); - goto try; - } - - page = virt_to_page(entry->data); - if (!page) { - pblk_err(pblk, "could not allocate write bio page\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != - rb->seg_size) { - pblk_err(pblk, "could not add page to write bio\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - - /* Release flags on context. 
Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - - pos = pblk_rb_ptr_wrap(rb, pos, 1); - } - - if (pad) { - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { - pblk_err(pblk, "could not pad page in write bio\n"); - return NVM_IO_ERR; - } - - if (pad < pblk->min_write_pgs) - atomic64_inc(&pblk->pad_dist[pad - 1]); - else - pblk_warn(pblk, "padding more than min. sectors\n"); - - atomic64_add(pad, &pblk->pad_wa); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(pad, &pblk->padded_writes); -#endif - - return NVM_IO_OK; -} - -/* - * Copy to bio only if the lba matches the one on the given cache entry. - * Otherwise, it means that the entry has been overwritten, and the bio should - * be directed to disk. - */ -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - struct ppa_addr l2p_ppa; - u64 pos = pblk_addr_to_cacheline(ppa); - void *data; - int flags; - int ret = 1; - - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must ensure that the access will not cause an overflow */ - BUG_ON(pos >= rb->nr_entries); -#endif - entry = &rb->entries[pos]; - w_ctx = &entry->w_ctx; - flags = READ_ONCE(w_ctx->flags); - - spin_lock(&rb->w_lock); - spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - /* Check if the entry has been overwritten or is scheduled to be */ - if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || - flags & PBLK_WRITABLE_ENTRY) { - ret = 0; - goto out; - } - data = bio_data(bio); - memcpy(data, entry->data, rb->seg_size); - -out: - spin_unlock(&rb->w_lock); - return ret; -} - -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) -{ - unsigned int entry = pblk_rb_ptr_wrap(rb, pos, 0); - - return &rb->entries[entry].w_ctx; -} - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long 
*flags) - __acquires(&rb->s_lock) -{ - if (flags) - spin_lock_irqsave(&rb->s_lock, *flags); - else - spin_lock_irq(&rb->s_lock); - - return rb->sync; -} - -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) - __releases(&rb->s_lock) -{ - lockdep_assert_held(&rb->s_lock); - - if (flags) - spin_unlock_irqrestore(&rb->s_lock, *flags); - else - spin_unlock_irq(&rb->s_lock); -} - -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int sync, flush_point; - lockdep_assert_held(&rb->s_lock); - - sync = READ_ONCE(rb->sync); - flush_point = READ_ONCE(rb->flush_point); - - if (flush_point != EMPTY_ENTRY) { - unsigned int secs_to_flush; - - secs_to_flush = pblk_rb_ring_count(flush_point, sync, - rb->nr_entries); - if (secs_to_flush < nr_entries) { - /* Protect flush points */ - smp_store_release(&rb->flush_point, EMPTY_ENTRY); - } - } - - sync = pblk_rb_ptr_wrap(rb, sync, nr_entries); - - /* Protect from counts */ - smp_store_release(&rb->sync, sync); - - return sync; -} - -/* Calculate how many sectors to submit up to the current flush point. */ -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) -{ - unsigned int subm, sync, flush_point; - unsigned int submitted, to_flush; - - /* Protect flush points */ - flush_point = smp_load_acquire(&rb->flush_point); - if (flush_point == EMPTY_ENTRY) - return 0; - - /* Protect syncs */ - sync = smp_load_acquire(&rb->sync); - - subm = READ_ONCE(rb->subm); - submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); - - /* The sync point itself counts as a sector to sync */ - to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; - - return (submitted < to_flush) ? 
(to_flush - submitted) : 0; -} - -int pblk_rb_tear_down_check(struct pblk_rb *rb) -{ - struct pblk_rb_entry *entry; - int i; - int ret = 0; - - spin_lock(&rb->w_lock); - spin_lock_irq(&rb->s_lock); - - if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && - (rb->sync == rb->l2p_update) && - (rb->flush_point == EMPTY_ENTRY)) { - goto out; - } - - if (!rb->entries) { - ret = 1; - goto out; - } - - for (i = 0; i < rb->nr_entries; i++) { - entry = &rb->entries[i]; - - if (!entry->data) { - ret = 1; - goto out; - } - } - -out: - spin_unlock_irq(&rb->s_lock); - spin_unlock(&rb->w_lock); - - return ret; -} - -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) -{ - return (pos & (rb->nr_entries - 1)); -} - -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) -{ - return (pos >= rb->nr_entries); -} - -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_c_ctx *c; - ssize_t offset; - int queued_entries = 0; - - spin_lock_irq(&rb->s_lock); - list_for_each_entry(c, &pblk->compl_list, list) - queued_entries++; - spin_unlock_irq(&rb->s_lock); - - if (rb->flush_point != EMPTY_ENTRY) - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - rb->flush_point, - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - else - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - - return offset; -} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c 
deleted file mode 100644 index c28537a489bc..000000000000 --- a/drivers/lightnvm/pblk-read.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-read.c - pblk's read path - */ - -#include "pblk.h" - -/* - * There is no guarantee that the value read from cache has not been updated and - * resides at another location in the cache. We guarantee though that if the - * value is read from the cache, it belongs to the mapped lba. In order to - * guarantee and order between writes and reads are ordered, a flush must be - * issued. 
- */ -static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, - sector_t lba, struct ppa_addr ppa) -{ -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(pblk_ppa_empty(ppa)); - BUG_ON(!pblk_addr_in_cache(ppa)); -#endif - - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa); -} - -static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct bio *bio, sector_t blba, - bool *from_cache) -{ - void *meta_list = rqd->meta_list; - int nr_secs, i; - -retry: - nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas, - from_cache); - - if (!*from_cache) - goto end; - - for (i = 0; i < nr_secs; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - sector_t lba = blba + i; - - if (pblk_ppa_empty(rqd->ppa_list[i])) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - } else if (pblk_addr_in_cache(rqd->ppa_list[i])) { - /* - * Try to read from write buffer. The address is later - * checked on the write buffer to prevent retrieving - * overwritten data. - */ - if (!pblk_read_from_cache(pblk, bio, lba, - rqd->ppa_list[i])) { - if (i == 0) { - /* - * We didn't call with bio_advance() - * yet, so we can just retry. - */ - goto retry; - } else { - /* - * We already call bio_advance() - * so we cannot retry and we need - * to quit that function in order - * to allow caller to handle the bio - * splitting in the current sector - * position. 
- */ - nr_secs = i; - goto end; - } - } - meta->lba = cpu_to_le64(lba); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - -end: - if (pblk_io_aligned(pblk, nr_secs)) - rqd->is_seq = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_secs, &pblk->inflight_reads); -#endif - - return nr_secs; -} - - -static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba) -{ - void *meta_list = rqd->meta_list; - int nr_lbas = rqd->nr_ppas; - int i; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - if (lba == ADDR_EMPTY) - continue; - - if (lba != blba + i) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[i], "seq", i); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - lba, (u64)blba + i); - WARN_ON(1); - } - } -} - -/* - * There can be holes in the lba list. 
- */ -static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, - u64 *lba_list, int nr_lbas) -{ - void *meta_lba_list = rqd->meta_list; - int i, j; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0, j = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - meta_lba_list, j); - u64 lba = lba_list[i]; - u64 meta_lba; - - if (lba == ADDR_EMPTY) - continue; - - meta_lba = le64_to_cpu(meta->lba); - - if (lba != meta_lba) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[j], "rnd", j); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - meta_lba, lba); - WARN_ON(1); - } - - j++; - } - - WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); -} - -static void pblk_end_user_read(struct bio *bio, int error) -{ - if (error && error != NVM_RSP_WARN_HIGHECC) - bio_io_error(bio); - else - bio_endio(bio); -} - -static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, - bool put_line) -{ - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *int_bio = rqd->bio; - unsigned long start_time = r_ctx->start_time; - - bio_end_io_acct(int_bio, start_time); - - if (rqd->error) - pblk_log_read_err(pblk, rqd); - - pblk_read_check_seq(pblk, rqd, r_ctx->lba); - bio_put(int_bio); - - if (put_line) - pblk_rq_to_line_put(pblk, rqd); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); - atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); -#endif - - pblk_free_rqd(pblk, rqd, PBLK_READ); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_read(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = (struct bio *)r_ctx->private; - - pblk_end_user_read(bio, rqd->error); - __pblk_end_io_read(pblk, rqd, true); -} - -static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, - sector_t lba, bool 
*from_cache) -{ - struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); - struct ppa_addr ppa; - - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -retry: - if (pblk_ppa_empty(ppa)) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - return; - } - - /* Try to read from write buffer. The address is later checked on the - * write buffer to prevent retrieving overwritten data. - */ - if (pblk_addr_in_cache(ppa)) { - if (!pblk_read_from_cache(pblk, bio, lba, ppa)) { - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - goto retry; - } - - meta->lba = cpu_to_le64(lba); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } else { - rqd->ppa_addr = ppa; - } -} - -void pblk_submit_read(struct pblk *pblk, struct bio *bio) -{ - sector_t blba = pblk_get_lba(bio); - unsigned int nr_secs = pblk_get_secs(bio); - bool from_cache; - struct pblk_g_ctx *r_ctx; - struct nvm_rq *rqd; - struct bio *int_bio, *split_bio; - unsigned long start_time; - - start_time = bio_start_io_acct(bio); - - rqd = pblk_alloc_rqd(pblk, PBLK_READ); - - rqd->opcode = NVM_OP_PREAD; - rqd->nr_ppas = nr_secs; - rqd->private = pblk; - rqd->end_io = pblk_end_io_read; - - r_ctx = nvm_rq_to_pdu(rqd); - r_ctx->start_time = start_time; - r_ctx->lba = blba; - - if (pblk_alloc_rqd_meta(pblk, rqd)) { - bio_io_error(bio); - pblk_free_rqd(pblk, rqd, PBLK_READ); - return; - } - - /* Clone read bio to deal internally with: - * -read errors when reading from drive - * -bio_advance() calls during cache reads - */ - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - - if (nr_secs > 1) - nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba, - &from_cache); - else - pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache); - -split_retry: - r_ctx->private = bio; /* original bio */ - rqd->bio = int_bio; /* internal bio */ - - if (from_cache && nr_secs == 
rqd->nr_ppas) { - /* All data was read from cache, we can complete the IO. */ - pblk_end_user_read(bio, 0); - atomic_inc(&pblk->inflight_io); - __pblk_end_io_read(pblk, rqd, false); - } else if (nr_secs != rqd->nr_ppas) { - /* The read bio request could be partially filled by the write - * buffer, but there are some holes that need to be read from - * the drive. In order to handle this, we will use block layer - * mechanism to split this request in to smaller ones and make - * a chain of it. - */ - split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL, - &pblk_bio_set); - bio_chain(split_bio, bio); - submit_bio_noacct(bio); - - /* New bio contains first N sectors of the previous one, so - * we can continue to use existing rqd, but we need to shrink - * the number of PPAs in it. New bio is also guaranteed that - * it contains only either data from cache or from drive, newer - * mix of them. - */ - bio = split_bio; - rqd->nr_ppas = nr_secs; - if (rqd->nr_ppas == 1) - rqd->ppa_addr = rqd->ppa_list[0]; - - /* Recreate int_bio - existing might have some needed internal - * fields modified already. 
- */ - bio_put(int_bio); - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - goto split_retry; - } else if (pblk_submit_io(pblk, rqd, NULL)) { - /* Submitting IO to drive failed, let's report an error */ - rqd->error = -ENODEV; - pblk_end_io_read(rqd); - } -} - -static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, u64 *lba_list, - u64 *paddr_list_gc, unsigned int nr_secs) -{ - struct ppa_addr ppa_list_l2p[NVM_MAX_VLBA]; - struct ppa_addr ppa_gc; - int valid_secs = 0; - int i; - - pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); - - for (i = 0; i < nr_secs; i++) { - if (lba_list[i] == ADDR_EMPTY) - continue; - - ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); - if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { - paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; - continue; - } - - rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_secs, &pblk->inflight_reads); -#endif - - return valid_secs; -} - -static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba, - u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int valid_secs = 0; - - if (lba == ADDR_EMPTY) - goto out; - - /* logic error: lba out-of-bounds */ - if (lba >= pblk->capacity) { - WARN(1, "pblk: read lba out of bounds\n"); - goto out; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) - goto out; - - rqd->ppa_addr = ppa_l2p; - valid_secs = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -out: - return valid_secs; -} - -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_rq rqd; - int ret = NVM_IO_OK; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - if 
(gc_rq->nr_secs > 1) { - gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list, - gc_rq->paddr_list, - gc_rq->nr_secs); - if (gc_rq->secs_to_gc == 1) - rqd.ppa_addr = rqd.ppa_list[0]; - } else { - gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list[0], - gc_rq->paddr_list[0]); - } - - if (!(gc_rq->secs_to_gc)) - goto out; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = gc_rq->secs_to_gc; - - if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { - ret = -EIO; - goto err_free_dma; - } - - pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - atomic_long_inc(&pblk->read_failed_gc); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, &rqd, rqd.error); -#endif - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); - atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); -#endif - -out: - pblk_free_rqd_meta(pblk, &rqd); - return ret; - -err_free_dma: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c deleted file mode 100644 index 0e6f0c76e930..000000000000 --- a/drivers/lightnvm/pblk-recovery.c +++ /dev/null @@ -1,874 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-recovery.c - pblk's recovery path - * - * The L2P recovery path is single threaded as the L2P table is updated in order - * following the line sequence ID. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) -{ - u32 crc; - - crc = pblk_calc_emeta_crc(pblk, emeta_buf); - if (le32_to_cpu(emeta_buf->crc) != crc) - return 1; - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) - return 1; - - return 0; -} - -static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - __le64 *lba_list; - u64 data_start, data_end; - u64 nr_valid_lbas, nr_lbas = 0; - u64 i; - - lba_list = emeta_to_lbas(pblk, emeta_buf); - if (!lba_list) - return 1; - - data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = line->emeta_ssec; - nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - - for (i = data_start; i < data_end; i++) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, i, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - /* Do not update bad blocks */ - if (test_bit(pos, line->blk_bitmap)) - continue; - - if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { - spin_lock(&line->lock); - if (test_and_set_bit(i, line->invalid_bitmap)) - WARN_ONCE(1, "pblk: rec. 
double invalidate:\n"); - else - le32_add_cpu(line->vsc, -1); - spin_unlock(&line->lock); - - continue; - } - - pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); - nr_lbas++; - } - - if (nr_valid_lbas != nr_lbas) - pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", - line->id, nr_valid_lbas, nr_lbas); - - line->left_msecs = 0; - - return 0; -} - -static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line, - u64 written_secs) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - for (i = 0; i < written_secs; i += pblk->min_write_pgs) - __pblk_alloc_page(pblk, line, pblk->min_write_pgs); - - spin_lock(&l_mg->free_lock); - if (written_secs > line->left_msecs) { - /* - * We have all data sectors written - * and some emeta sectors written too. - */ - line->left_msecs = 0; - } else { - /* We have only some data sectors written. */ - line->left_msecs -= written_secs; - } - spin_unlock(&l_mg->free_lock); -} - -static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - u64 written_secs = 0; - int valid_chunks = 0; - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct nvm_chk_meta *chunk = &line->chks[i]; - - if (chunk->state & NVM_CHK_ST_OFFLINE) - continue; - - written_secs += chunk->wp; - valid_chunks++; - } - - if (lm->blk_per_line - nr_bb != valid_chunks) - pblk_err(pblk, "recovery line %d is bad\n", line->id); - - pblk_update_line_wp(pblk, line, written_secs - lm->smeta_sec); - - return written_secs; -} - -struct pblk_recov_alloc { - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list; - dma_addr_t dma_meta_list; -}; - -static void pblk_recov_complete(struct kref *ref) -{ - struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); - - complete(&pad_rq->wait); -} - -static void pblk_end_io_recov(struct nvm_rq *rqd) -{ - struct 
ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_pad_rq *pad_rq = rqd->private; - struct pblk *pblk = pad_rq->pblk; - - pblk_up_chunk(pblk, ppa_list[0]); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); - kref_put(&pad_rq->ref, pblk_recov_complete); -} - -/* pad line using line bitmap. */ -static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, - int left_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - void *meta_list; - struct pblk_pad_rq *pad_rq; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *data; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas; - int i, j; - int ret = 0; - - spin_lock(&line->lock); - left_line_ppas = line->left_msecs; - spin_unlock(&line->lock); - - pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); - if (!pad_rq) - return -ENOMEM; - - data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs)); - if (!data) { - ret = -ENOMEM; - goto free_rq; - } - - pad_rq->pblk = pblk; - init_completion(&pad_rq->wait); - kref_init(&pad_rq->ref); - -next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (rq_ppas < pblk->min_write_pgs) { - pblk_err(pblk, "corrupted pad line %d\n", line->id); - goto fail_complete; - } - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - ret = pblk_alloc_rqd_meta(pblk, rqd); - if (ret) { - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - rqd->bio = NULL; - rqd->opcode = NVM_OP_PWRITE; - rqd->is_seq = 1; - rqd->nr_ppas = rq_ppas; - rqd->end_io = pblk_end_io_recov; - rqd->private = pad_rq; - - ppa_list = nvm_rq_to_ppa_list(rqd); - meta_list = rqd->meta_list; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, 
line->blk_bitmap)) { - w_ptr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { - struct ppa_addr dev_ppa; - struct pblk_sec_meta *meta; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - - pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = addr_empty; - meta = pblk_get_meta(pblk, meta_list, i); - meta->lba = addr_empty; - ppa_list[i] = dev_ppa; - } - } - - kref_get(&pad_rq->ref); - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - pblk_up_chunk(pblk, ppa_list[0]); - kref_put(&pad_rq->ref, pblk_recov_complete); - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - left_line_ppas -= rq_ppas; - left_ppas -= rq_ppas; - if (left_ppas && left_line_ppas) - goto next_pad_rq; - -fail_complete: - kref_put(&pad_rq->ref, pblk_recov_complete); - wait_for_completion(&pad_rq->wait); - - if (!pblk_line_is_full(line)) - pblk_err(pblk, "corrupted padded line: %d\n", line->id); - - vfree(data); -free_rq: - kfree(pad_rq); - return ret; -} - -static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int distance = geo->mw_cunits * geo->all_luns * geo->ws_opt; - - return (distance > line->left_msecs) ? 
line->left_msecs : distance; -} - -/* Return a chunk belonging to a line by stripe(write order) index */ -static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk, - struct pblk_line *line, - int index) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - struct ppa_addr ppa; - int pos; - - rlun = &pblk->luns[index]; - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - - return &line->chks[pos]; -} - -static int pblk_line_wps_are_unbalanced(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = lm->blk_per_line; - struct nvm_chk_meta *chunk; - u64 max_wp, min_wp; - int i; - - i = find_first_zero_bit(line->blk_bitmap, blk_in_line); - - /* If there is one or zero good chunks in the line, - * the write pointers can't be unbalanced. - */ - if (i >= (blk_in_line - 1)) - return 0; - - chunk = pblk_get_stripe_chunk(pblk, line, i); - max_wp = chunk->wp; - if (max_wp > pblk->max_write_pgs) - min_wp = max_wp - pblk->max_write_pgs; - else - min_wp = 0; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - while (i < blk_in_line) { - chunk = pblk_get_stripe_chunk(pblk, line, i); - if (chunk->wp > max_wp || chunk->wp < min_wp) - return 1; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - } - - return 0; -} - -static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list; - u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - bool padded = false; - int rq_ppas; - int i, j; - int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; - - if (pblk_line_wps_are_unbalanced(pblk, line)) - pblk_warn(pblk, 
"recovering unbalanced line (%d)\n", line->id); - - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; - - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - -next_rq: - memset(rqd, 0, pblk_g_rq_size); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; - -retry_rq: - rqd->bio = NULL; - rqd->opcode = NVM_OP_PREAD; - rqd->meta_list = meta_list; - rqd->nr_ppas = rq_ppas; - rqd->ppa_list = ppa_list; - rqd->dma_ppa_list = dma_ppa_list; - rqd->dma_meta_list = dma_meta_list; - ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd->is_seq = 1; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - paddr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++) - ppa_list[i] = - addr_to_gen_ppa(pblk, paddr + j, line->id); - } - - ret = pblk_submit_io_sync(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - return ret; - } - - atomic_dec(&pblk->inflight_io); - - /* If a read fails, do a best effort by padding the line and retrying */ - if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { - int pad_distance, ret; - - if (padded) { - pblk_log_read_err(pblk, rqd); - return -EINTR; - } - - pad_distance = pblk_pad_distance(pblk, line); - ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) { - return ret; - } - - padded = true; - goto retry_rq; - } - - pblk_get_packed_meta(pblk, rqd); - - for (i = 0; i < rqd->nr_ppas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - lba_list[paddr++] = cpu_to_le64(lba); - - if (lba == ADDR_EMPTY || lba >= 
pblk->capacity) - continue; - - line->nr_valid_lbas++; - pblk_update_map(pblk, lba, ppa_list[i]); - } - - left_ppas -= rq_ppas; - if (left_ppas > 0) - goto next_rq; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(padded && !pblk_line_is_full(line)); -#endif - - return 0; -} - -/* Scan line for lbas on out of bound area */ -static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *meta_list; - struct pblk_recov_alloc p; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - int ret = 0; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - - data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto free_meta_list; - } - - rqd = mempool_alloc(&pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd = rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - ret = pblk_recov_scan_oob(pblk, line, p); - if (ret) { - pblk_err(pblk, "could not recover L2P form OOB\n"); - goto out; - } - - if (pblk_line_is_full(line)) - pblk_line_recov_close(pblk, line); - -out: - mempool_free(rqd, &pblk->r_rq_pool); - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); - - return ret; -} - -/* Insert lines ordered by sequence number (seq_num) on list */ -static void pblk_recov_line_add_ordered(struct list_head *head, - struct pblk_line *line) -{ - struct pblk_line *t = NULL; - - list_for_each_entry(t, head, list) - if (t->seq_nr > line->seq_nr) - break; - - __list_add(&line->list, t->list.prev, &t->list); -} - -static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line 
*line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int emeta_secs; - u64 emeta_start; - struct ppa_addr ppa; - int pos; - - emeta_secs = lm->emeta_sec[0]; - emeta_start = lm->sec_per_line; - - while (emeta_secs) { - emeta_start--; - ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - if (!test_bit(pos, line->blk_bitmap)) - emeta_secs--; - } - - return emeta_start; -} - -static int pblk_recov_check_line_version(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct line_header *header = &emeta->header; - - if (header->version_major != EMETA_VERSION_MAJOR) { - pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", - header->version_major, EMETA_VERSION_MAJOR); - return 1; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (header->version_minor > EMETA_VERSION_MINOR) - pblk_info(pblk, "newer line minor version found: %d\n", - header->version_minor); -#endif - - return 0; -} - -static void pblk_recov_wa_counters(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct line_header *header = &emeta->header; - struct wa_counters *wa = emeta_to_wa(lm, emeta); - - /* WA counters were introduced in emeta version 0.2 */ - if (header->version_major > 0 || header->version_minor >= 2) { - u64 user = le64_to_cpu(wa->user); - u64 pad = le64_to_cpu(wa->pad); - u64 gc = le64_to_cpu(wa->gc); - - atomic64_set(&pblk->user_wa, user); - atomic64_set(&pblk->pad_wa, pad); - atomic64_set(&pblk->gc_wa, gc); - - pblk->user_rst_wa = user; - pblk->pad_rst_wa = pad; - pblk->gc_rst_wa = gc; - } -} - -static int pblk_line_was_written(struct pblk_line *line, - struct pblk *pblk) -{ - - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct ppa_addr bppa; - int smeta_blk; - - if (line->state == PBLK_LINESTATE_BAD) - return 0; - - 
smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (smeta_blk >= lm->blk_per_line) - return 0; - - bppa = pblk->luns[smeta_blk].bppa; - chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; - - if (chunk->state & NVM_CHK_ST_CLOSED || - (chunk->state & NVM_CHK_ST_OPEN - && chunk->wp >= lm->smeta_sec)) - return 1; - - return 0; -} - -static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int i; - - for (i = 0; i < lm->blk_per_line; i++) - if (line->chks[i].state & NVM_CHK_ST_OPEN) - return true; - - return false; -} - -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line, *tline, *data_line = NULL; - struct pblk_smeta *smeta; - struct pblk_emeta *emeta; - struct line_smeta *smeta_buf; - int found_lines = 0, recovered_lines = 0, open_lines = 0; - int is_next = 0; - int meta_line; - int i, valid_uuid = 0; - LIST_HEAD(recov_list); - - /* TODO: Implement FTL snapshot */ - - /* Scan recovery - takes place when FTL snapshot fails */ - spin_lock(&l_mg->free_lock); - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line]; - emeta = l_mg->eline_meta[meta_line]; - smeta_buf = (struct line_smeta *)smeta; - spin_unlock(&l_mg->free_lock); - - /* Order data lines using their sequence number */ - for (i = 0; i < l_mg->nr_lines; i++) { - u32 crc; - - line = &pblk->lines[i]; - - memset(smeta, 0, lm->smeta_len); - line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta_buf)) + - sizeof(struct line_smeta); - - if (!pblk_line_was_written(line, pblk)) - continue; - - /* Lines that cannot be read are assumed as not written here */ - if (pblk_line_smeta_read(pblk, line)) - continue; - - crc = pblk_calc_smeta_crc(pblk, smeta_buf); - if (le32_to_cpu(smeta_buf->crc) != crc) - continue; - - if 
(le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) - continue; - - if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { - pblk_err(pblk, "found incompatible line version %u\n", - smeta_buf->header.version_major); - return ERR_PTR(-EINVAL); - } - - /* The first valid instance uuid is used for initialization */ - if (!valid_uuid) { - import_guid(&pblk->instance_uuid, smeta_buf->header.uuid); - valid_uuid = 1; - } - - if (!guid_equal(&pblk->instance_uuid, - (guid_t *)&smeta_buf->header.uuid)) { - pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", - i); - continue; - } - - /* Update line metadata */ - spin_lock(&line->lock); - line->id = le32_to_cpu(smeta_buf->header.id); - line->type = le16_to_cpu(smeta_buf->header.type); - line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); - spin_unlock(&line->lock); - - /* Update general metadata */ - spin_lock(&l_mg->free_lock); - if (line->seq_nr >= l_mg->d_seq_nr) - l_mg->d_seq_nr = line->seq_nr + 1; - l_mg->nr_free_lines--; - spin_unlock(&l_mg->free_lock); - - if (pblk_line_recov_alloc(pblk, line)) - goto out; - - pblk_recov_line_add_ordered(&recov_list, line); - found_lines++; - pblk_debug(pblk, "recovering data line %d, seq:%llu\n", - line->id, smeta_buf->seq_nr); - } - - if (!found_lines) { - guid_gen(&pblk->instance_uuid); - - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - goto out; - } - - /* Verify closed blocks and recover this portion of L2P table*/ - list_for_each_entry_safe(line, tline, &recov_list, list) { - recovered_lines++; - - line->emeta_ssec = pblk_line_emeta_start(pblk, line); - line->emeta = emeta; - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - if (pblk_line_is_open(pblk, line)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_emeta(pblk, 
line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_line_version(pblk, line->emeta->buf)) - return ERR_PTR(-EINVAL); - - pblk_recov_wa_counters(pblk, line->emeta->buf); - - if (pblk_recov_l2p_from_emeta(pblk, line)) - pblk_recov_l2p_from_oob(pblk, line); - -next: - if (pblk_line_is_full(line)) { - struct list_head *move_list; - - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - spin_lock(&l_mg->gc_lock); - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - } else { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_OPEN; - spin_unlock(&line->lock); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - data_line = line; - line->meta_line = meta_line; - - open_lines++; - } - } - - if (!open_lines) { - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - } else { - spin_lock(&l_mg->free_lock); - l_mg->data_line = data_line; - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - spin_unlock(&l_mg->free_lock); - } - - if (is_next) - pblk_line_erase(pblk, l_mg->data_next); - -out: - if (found_lines != recovered_lines) - pblk_err(pblk, "failed to recover all found lines %d/%d\n", - found_lines, recovered_lines); - - return data_line; -} - -/* - * Pad current line - */ -int pblk_recov_pad(struct pblk *pblk) -{ - struct pblk_line *line; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - 
int left_msecs; - int ret = 0; - - spin_lock(&l_mg->free_lock); - line = l_mg->data_line; - left_msecs = line->left_msecs; - spin_unlock(&l_mg->free_lock); - - ret = pblk_recov_pad_line(pblk, line, left_msecs); - if (ret) { - pblk_err(pblk, "tear down padding failed (%d)\n", ret); - return ret; - } - - pblk_line_close_meta(pblk, line); - return ret; -} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c deleted file mode 100644 index a5f8bc2defbc..000000000000 --- a/drivers/lightnvm/pblk-rl.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * pblk-rl.c - pblk's rate limiter for user I/O - * - */ - -#include "pblk.h" - -static void pblk_rl_kick_u_timer(struct pblk_rl *rl) -{ - mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); -} - -int pblk_rl_is_limit(struct pblk_rl *rl) -{ - int rb_space; - - rb_space = atomic_read(&rl->rb_space); - - return (rb_space == 0); -} - -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_user_cnt = atomic_read(&rl->rb_user_cnt); - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) - return NVM_IO_ERR; - - if (rb_user_cnt >= rl->rb_user_max) - return NVM_IO_REQUEUE; - - return NVM_IO_OK; -} - -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) -{ - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0)) - atomic_sub(nr_entries, &rl->rb_space); -} - -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); - int rb_user_active; - - /* If there is no user I/O let GC take over space on the write buffer */ - rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); -} - -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_user_cnt); - - /* Release user I/O state. 
Protect from GC */ - smp_store_release(&rl->rb_user_active, 1); - pblk_rl_kick_u_timer(rl); -} - -void pblk_rl_werr_line_in(struct pblk_rl *rl) -{ - atomic_inc(&rl->werr_lines); -} - -void pblk_rl_werr_line_out(struct pblk_rl *rl) -{ - atomic_dec(&rl->werr_lines); -} - -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_gc_cnt); -} - -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) -{ - atomic_sub(nr_user, &rl->rb_user_cnt); - atomic_sub(nr_gc, &rl->rb_gc_cnt); -} - -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_blocks); -} - -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_user_blocks); -} - -static void __pblk_rl_update_rates(struct pblk_rl *rl, - unsigned long free_blocks) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - int max = rl->rb_budget; - int werr_gc_needed = atomic_read(&rl->werr_lines); - - if (free_blocks >= rl->high) { - if (werr_gc_needed) { - /* Allocate a small budget for recovering - * lines with write errors - */ - rl->rb_gc_max = 1 << rl->rb_windows_pw; - rl->rb_user_max = max - rl->rb_gc_max; - rl->rb_state = PBLK_RL_WERR; - } else { - rl->rb_user_max = max; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_OFF; - } - } else if (free_blocks < rl->high) { - int shift = rl->high_pw - rl->rb_windows_pw; - int user_windows = free_blocks >> shift; - int user_max = user_windows << ilog2(NVM_MAX_VLBA); - - rl->rb_user_max = user_max; - rl->rb_gc_max = max - user_max; - - if (free_blocks <= rl->rsv_blocks) { - rl->rb_user_max = 0; - rl->rb_gc_max = max; - } - - /* In the worst case, we will need to GC lines in the low list - * (high valid sector count). 
If there are lines to GC on high - * or mid lists, these will be prioritized - */ - rl->rb_state = PBLK_RL_LOW; - } - - if (rl->rb_state != PBLK_RL_OFF) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); -} - -void pblk_rl_update_rates(struct pblk_rl *rl) -{ - __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); -} - -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_add(blk_in_line, &rl->free_blocks); - free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_sub(blk_in_line, &rl->free_blocks); - - if (used) - free_blocks = atomic_sub_return(blk_in_line, - &rl->free_user_blocks); - else - free_blocks = atomic_read(&rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -int pblk_rl_high_thrs(struct pblk_rl *rl) -{ - return rl->high; -} - -int pblk_rl_max_io(struct pblk_rl *rl) -{ - return rl->rb_max_io; -} - -static void pblk_rl_u_timer(struct timer_list *t) -{ - struct pblk_rl *rl = from_timer(rl, t, u_timer); - - /* Release user I/O state. 
Protect from GC */ - smp_store_release(&rl->rb_user_active, 0); -} - -void pblk_rl_free(struct pblk_rl *rl) -{ - del_timer(&rl->u_timer); -} - -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int sec_meta, blk_meta; - unsigned int rb_windows; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; - rl->high_pw = get_count_order(rl->high); - - rl->rsv_blocks = pblk_get_min_chks(pblk); - - /* This will always be a power-of-2 */ - rb_windows = budget / NVM_MAX_VLBA; - rl->rb_windows_pw = get_count_order(rb_windows); - - /* To start with, all buffer is available to user I/O writers */ - rl->rb_budget = budget; - rl->rb_user_max = budget; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_HIGH; - - /* Maximize I/O size and ansure that back threshold is respected */ - if (threshold) - rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; - else - rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; - - atomic_set(&rl->rb_user_cnt, 0); - atomic_set(&rl->rb_gc_cnt, 0); - atomic_set(&rl->rb_space, -1); - atomic_set(&rl->werr_lines, 0); - - timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); - - rl->rb_user_active = 0; - rl->rb_gc_active = 0; -} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c deleted file mode 100644 index 6387302b03f2..000000000000 --- a/drivers/lightnvm/pblk-sysfs.c +++ /dev/null @@ -1,728 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of 
the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-sysfs.c - pblk's sysfs - * - */ - -#include "pblk.h" - -static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - ssize_t sz = 0; - int i; - - for (i = 0; i < geo->all_luns; i++) { - int active = 1; - - rlun = &pblk->luns[i]; - if (!down_trylock(&rlun->wr_sem)) { - active = 0; - up(&rlun->wr_sem); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "pblk: pos:%d, ch:%d, lun:%d - %d\n", - i, - rlun->bppa.a.ch, - rlun->bppa.a.lun, - active); - } - - return sz; -} - -static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) -{ - int free_blocks, free_user_blocks, total_blocks; - int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; - - free_blocks = pblk_rl_nr_free_blks(&pblk->rl); - free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); - rb_user_max = pblk->rl.rb_user_max; - rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); - rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); - rb_budget = pblk->rl.rb_budget; - rb_state = pblk->rl.rb_state; - - total_blocks = pblk->rl.total_blocks; - - return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", - rb_user_cnt, - rb_user_max, - rb_gc_cnt, - rb_gc_max, - rb_state, - rb_budget, - pblk->rl.high, - free_blocks, - free_user_blocks, - total_blocks, - READ_ONCE(pblk->rl.rb_user_active)); -} - -static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) -{ - int gc_enabled, 
gc_active; - - pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); - return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", - gc_enabled, gc_active); -} - -static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) -{ - ssize_t sz; - - sz = snprintf(page, PAGE_SIZE, - "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", - atomic_long_read(&pblk->read_failed), - atomic_long_read(&pblk->read_high_ecc), - atomic_long_read(&pblk->read_empty), - atomic_long_read(&pblk->read_failed_gc), - atomic_long_read(&pblk->write_failed), - atomic_long_read(&pblk->erase_failed)); - - return sz; -} - -static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) -{ - return pblk_rb_sysfs(&pblk->rwb, page); -} - -static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - ssize_t sz = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - pblk->addrf_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - gppaf->blk_offset, gppaf->blk_len, - gppaf->pg_offset, gppaf->pg_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->ch_offset, gppaf->ch_len, - gppaf->pln_offset, gppaf->pln_len, - gppaf->sec_offset, gppaf->sec_len); - } else { - struct nvm_addrf *ppaf = &pblk->addrf; - struct nvm_addrf *gppaf = &geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n", - pblk->addrf_len, - 
ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->chk_offset, ppaf->chk_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n", - gppaf->ch_offset, gppaf->ch_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->chk_offset, gppaf->chk_len, - gppaf->sec_offset, gppaf->sec_len); - } - - return sz; -} - -static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - ssize_t sz = 0; - int nr_free_lines; - int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; - int d_line_cnt = 0, l_line_cnt = 0; - int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int gc_werr = 0; - - int bad = 0, cor = 0; - int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; - int map_weight = 0, meta_weight = 0; - - spin_lock(&l_mg->free_lock); - cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; - cur_log = (l_mg->log_line) ? 
l_mg->log_line->id : -1; - nr_free_lines = l_mg->nr_free_lines; - - list_for_each_entry(line, &l_mg->free_list, list) - free_line_cnt++; - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->close_lock); - list_for_each_entry(line, &l_mg->emeta_list, list) - emeta_line_cnt++; - spin_unlock(&l_mg->close_lock); - - spin_lock(&l_mg->gc_lock); - list_for_each_entry(line, &l_mg->gc_full_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_full++; - } - - list_for_each_entry(line, &l_mg->gc_high_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_high++; - } - - list_for_each_entry(line, &l_mg->gc_mid_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_mid++; - } - - list_for_each_entry(line, &l_mg->gc_low_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_low++; - } - - list_for_each_entry(line, &l_mg->gc_empty_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_empty++; - } - - list_for_each_entry(line, &l_mg->gc_werr_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_werr++; - } - - list_for_each_entry(line, &l_mg->bad_list, list) - bad++; - list_for_each_entry(line, &l_mg->corrupt_list, list) - cor++; - spin_unlock(&l_mg->gc_lock); - - spin_lock(&l_mg->free_lock); - if (l_mg->data_line) { - cur_sec = l_mg->data_line->cur_sec; - msecs = l_mg->data_line->left_msecs; - vsc = le32_to_cpu(*l_mg->data_line->vsc); - sec_in_line = l_mg->data_line->sec_in_line; - 
meta_weight = bitmap_weight(&l_mg->meta_bitmap, - PBLK_DATA_LINES); - - spin_lock(&l_mg->data_line->lock); - if (l_mg->data_line->map_bitmap) - map_weight = bitmap_weight(l_mg->data_line->map_bitmap, - lm->sec_per_line); - else - map_weight = 0; - spin_unlock(&l_mg->data_line->lock); - } - spin_unlock(&l_mg->free_lock); - - if (nr_free_lines != free_line_cnt) - pblk_err(pblk, "corrupted free line list:%d/%d\n", - nr_free_lines, free_line_cnt); - - sz = scnprintf(page, PAGE_SIZE - sz, - "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->all_luns, lm->blk_per_line, lm->sec_per_line); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", - cur_data, cur_log, - nr_free_lines, - emeta_line_cnt, meta_weight, - closed_line_cnt, - bad, cor, - d_line_cnt, l_line_cnt, - l_mg->nr_lines); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", - gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, - atomic_read(&pblk->gc.read_inflight_gc)); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, - atomic_read(&pblk->inflight_io)); - - return sz; -} - -static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - ssize_t sz = 0; - - sz = scnprintf(page, PAGE_SIZE - sz, - "smeta - len:%d, secs:%d\n", - lm->smeta_len, lm->smeta_sec); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len[0], lm->emeta_sec[0], - lm->emeta_bb); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "bitmap lengths: sec:%d, blk:%d, lun:%d\n", - lm->sec_bitmap_len, - lm->blk_bitmap_len, - lm->lun_bitmap_len); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "blk_line:%d, sec_line:%d, 
sec_blk:%d\n", - lm->blk_per_line, - lm->sec_per_line, - geo->clba); - - return sz; -} - -static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); -} - -static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad, - char *page) -{ - int sz; - - sz = scnprintf(page, PAGE_SIZE, - "user:%lld gc:%lld pad:%lld WA:", - user, gc, pad); - - if (!user) { - sz += scnprintf(page + sz, PAGE_SIZE - sz, "NaN\n"); - } else { - u64 wa_int; - u32 wa_frac; - - wa_int = (user + gc + pad) * 100000; - wa_int = div64_u64(wa_int, user); - wa_int = div_u64_rem(wa_int, 100000, &wa_frac); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n", - wa_int, wa_frac); - } - - return sz; -} - -static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp(atomic64_read(&pblk->user_wa), - atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa), - page); -} - -static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp( - atomic64_read(&pblk->user_wa) - pblk->user_rst_wa, - atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa, - atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page); -} - -static long long bucket_percentage(unsigned long long bucket, - unsigned long long total) -{ - int p = bucket * 100; - - p = div_u64(p, total); - - return p; -} - -static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) -{ - int sz = 0; - unsigned long long total; - unsigned long long total_buckets = 0; - int buckets = pblk->min_write_pgs - 1; - int i; - - total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst; - if (!total) { - for (i = 0; i < (buckets + 1); i++) - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "%d:0 ", i); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; - } - - for (i = 0; i < buckets; i++) - total_buckets += atomic64_read(&pblk->pad_dist[i]); - - sz += scnprintf(page + sz, 
PAGE_SIZE - sz, "0:%lld%% ", - bucket_percentage(total - total_buckets, total)); - - for (i = 0; i < buckets; i++) { - unsigned long long p; - - p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]), - total); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ", - i + 1, p); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, - "%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", - atomic_long_read(&pblk->inflight_writes), - atomic_long_read(&pblk->inflight_reads), - atomic_long_read(&pblk->req_writes), - (u64)atomic64_read(&pblk->nr_flush), - atomic_long_read(&pblk->padded_writes), - atomic_long_read(&pblk->padded_wb), - atomic_long_read(&pblk->sub_writes), - atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->recov_writes), - atomic_long_read(&pblk->recov_gc_writes), - atomic_long_read(&pblk->recov_gc_reads), - atomic_long_read(&pblk->cache_reads), - atomic_long_read(&pblk->sync_reads)); -} -#endif - -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) -{ - size_t c_len; - int force; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &force)) - return -EINVAL; - - pblk_gc_sysfs_force(pblk, force); - - return len; -} - -static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int sec_per_write; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &sec_per_write)) - return -EINVAL; - - if (!pblk_is_oob_meta_supported(pblk)) { - /* For packed metadata case it is - * not allowed to change sec_per_write. 
- */ - return -EINVAL; - } - - if (sec_per_write < pblk->min_write_pgs - || sec_per_write > pblk->max_write_pgs - || sec_per_write % pblk->min_write_pgs != 0) - return -EINVAL; - - pblk_set_sec_per_write(pblk, sec_per_write); - - return len; -} - -static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - pblk->user_rst_wa = atomic64_read(&pblk->user_wa); - pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa); - pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa); - - return len; -} - - -static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - int buckets = pblk->min_write_pgs - 1; - int i; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - for (i = 0; i < buckets; i++) - atomic64_set(&pblk->pad_dist[i], 0); - - pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush); - - return len; -} - -static struct attribute sys_write_luns = { - .name = "write_luns", - .mode = 0444, -}; - -static struct attribute sys_rate_limiter_attr = { - .name = "rate_limiter", - .mode = 0444, -}; - -static struct attribute sys_gc_state = { - .name = "gc_state", - .mode = 0444, -}; - -static struct attribute sys_errors_attr = { - .name = "errors", - .mode = 0444, -}; - -static struct attribute sys_rb_attr = { - .name = "write_buffer", - .mode = 0444, -}; - -static struct attribute sys_stats_ppaf_attr = { - .name = "ppa_format", - .mode = 0444, -}; - -static struct attribute sys_lines_attr = { - .name = "lines", - .mode = 0444, -}; - -static struct attribute sys_lines_info_attr = { - .name = "lines_info", - .mode = 0444, -}; - -static struct attribute 
sys_gc_force = { - .name = "gc_force", - .mode = 0200, -}; - -static struct attribute sys_max_sec_per_write = { - .name = "max_sec_per_write", - .mode = 0644, -}; - -static struct attribute sys_write_amp_mileage = { - .name = "write_amp_mileage", - .mode = 0444, -}; - -static struct attribute sys_write_amp_trip = { - .name = "write_amp_trip", - .mode = 0644, -}; - -static struct attribute sys_padding_dist = { - .name = "padding_dist", - .mode = 0644, -}; - -#ifdef CONFIG_NVM_PBLK_DEBUG -static struct attribute sys_stats_debug_attr = { - .name = "stats", - .mode = 0444, -}; -#endif - -static struct attribute *pblk_attrs[] = { - &sys_write_luns, - &sys_rate_limiter_attr, - &sys_errors_attr, - &sys_gc_state, - &sys_gc_force, - &sys_max_sec_per_write, - &sys_rb_attr, - &sys_stats_ppaf_attr, - &sys_lines_attr, - &sys_lines_info_attr, - &sys_write_amp_mileage, - &sys_write_amp_trip, - &sys_padding_dist, -#ifdef CONFIG_NVM_PBLK_DEBUG - &sys_stats_debug_attr, -#endif - NULL, -}; - -static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "rate_limiter") == 0) - return pblk_sysfs_rate_limiter(pblk, buf); - else if (strcmp(attr->name, "write_luns") == 0) - return pblk_sysfs_luns_show(pblk, buf); - else if (strcmp(attr->name, "gc_state") == 0) - return pblk_sysfs_gc_state_show(pblk, buf); - else if (strcmp(attr->name, "errors") == 0) - return pblk_sysfs_stats(pblk, buf); - else if (strcmp(attr->name, "write_buffer") == 0) - return pblk_sysfs_write_buffer(pblk, buf); - else if (strcmp(attr->name, "ppa_format") == 0) - return pblk_sysfs_ppaf(pblk, buf); - else if (strcmp(attr->name, "lines") == 0) - return pblk_sysfs_lines(pblk, buf); - else if (strcmp(attr->name, "lines_info") == 0) - return pblk_sysfs_lines_info(pblk, buf); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_get_sec_per_write(pblk, buf); - else if 
(strcmp(attr->name, "write_amp_mileage") == 0) - return pblk_sysfs_get_write_amp_mileage(pblk, buf); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_get_write_amp_trip(pblk, buf); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_get_padding_dist(pblk, buf); -#ifdef CONFIG_NVM_PBLK_DEBUG - else if (strcmp(attr->name, "stats") == 0) - return pblk_sysfs_stats_debug(pblk, buf); -#endif - return 0; -} - -static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "gc_force") == 0) - return pblk_sysfs_gc_force(pblk, buf, len); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_set_sec_per_write(pblk, buf, len); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_set_write_amp_trip(pblk, buf, len); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_set_padding_dist(pblk, buf, len); - return 0; -} - -static const struct sysfs_ops pblk_sysfs_ops = { - .show = pblk_sysfs_show, - .store = pblk_sysfs_store, -}; - -static struct kobj_type pblk_ktype = { - .sysfs_ops = &pblk_sysfs_ops, - .default_attrs = pblk_attrs, -}; - -int pblk_sysfs_init(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - struct device *parent_dev = disk_to_dev(pblk->disk); - int ret; - - ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, - kobject_get(&parent_dev->kobj), - "%s", "pblk"); - if (ret) { - pblk_err(pblk, "could not register\n"); - return ret; - } - - kobject_uevent(&pblk->kobj, KOBJ_ADD); - return 0; -} - -void pblk_sysfs_exit(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - - kobject_uevent(&pblk->kobj, KOBJ_REMOVE); - kobject_del(&pblk->kobj); - kobject_put(&pblk->kobj); -} diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h deleted file mode 100644 index 
47b67c6bff7a..000000000000 --- a/drivers/lightnvm/pblk-trace.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM pblk - -#if !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PBLK_H - -#include - -struct ppa_addr; - -#define show_chunk_flags(state) __print_flags(state, "", \ - { NVM_CHK_ST_FREE, "FREE", }, \ - { NVM_CHK_ST_CLOSED, "CLOSED", }, \ - { NVM_CHK_ST_OPEN, "OPEN", }, \ - { NVM_CHK_ST_OFFLINE, "OFFLINE", }) - -#define show_line_state(state) __print_symbolic(state, \ - { PBLK_LINESTATE_NEW, "NEW", }, \ - { PBLK_LINESTATE_FREE, "FREE", }, \ - { PBLK_LINESTATE_OPEN, "OPEN", }, \ - { PBLK_LINESTATE_CLOSED, "CLOSED", }, \ - { PBLK_LINESTATE_GC, "GC", }, \ - { PBLK_LINESTATE_BAD, "BAD", }, \ - { PBLK_LINESTATE_CORRUPT, "CORRUPT" }) - - -#define show_pblk_state(state) __print_symbolic(state, \ - { PBLK_STATE_RUNNING, "RUNNING", }, \ - { PBLK_STATE_STOPPING, "STOPPING", }, \ - { PBLK_STATE_RECOVERING, "RECOVERING", }, \ - { PBLK_STATE_STOPPED, "STOPPED" }) - -#define show_chunk_erase_state(state) __print_symbolic(state, \ - { PBLK_CHUNK_RESET_START, "START", }, \ - { PBLK_CHUNK_RESET_DONE, "OK", }, \ - { PBLK_CHUNK_RESET_FAILED, "FAILED" }) - - -TRACE_EVENT(pblk_chunk_reset, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_erase_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_chunk_state, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), 
- - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_flags((int)__entry->state)) - -); - -TRACE_EVENT(pblk_line_state, - - TP_PROTO(const char *name, int line, int state), - - TP_ARGS(name, line, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, line) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->line = line; - __entry->state = state; - ), - - TP_printk("dev=%s line=%d state=%s", __get_str(name), - (int)__entry->line, - show_line_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_state, - - TP_PROTO(const char *name, int state), - - TP_ARGS(name, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->state = state; - ), - - TP_printk("dev=%s state=%s", __get_str(name), - show_pblk_state((int)__entry->state)) - -); - -#endif /* !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) */ - -/* This part must be outside protection */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../drivers/lightnvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE pblk-trace -#include diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c deleted file mode 100644 index b9a2aeba95ab..000000000000 --- a/drivers/lightnvm/pblk-write.c +++ /dev/null @@ -1,665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public 
License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-write.c - pblk's write path from write buffer to media - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct bio *original_bio; - struct pblk_rb *rwb = &pblk->rwb; - unsigned long ret; - int i; - - for (i = 0; i < c_ctx->nr_valid; i++) { - struct pblk_w_ctx *w_ctx; - int pos = c_ctx->sentry + i; - int flags; - - w_ctx = pblk_rb_w_ctx(rwb, pos); - flags = READ_ONCE(w_ctx->flags); - - if (flags & PBLK_FLUSH_ENTRY) { - flags &= ~PBLK_FLUSH_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_dec(&rwb->inflight_flush_point); -#endif - } - - while ((original_bio = bio_list_pop(&w_ctx->bios))) - bio_endio(original_bio); - } - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); -#endif - - ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return ret; -} - -static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, - struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - list_del(&c_ctx->list); - return pblk_end_w_bio(pblk, rqd, c_ctx); -} - -static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *c, *r; - unsigned long flags; - unsigned long pos; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); -#endif - pblk_up_rq(pblk, 
c_ctx->lun_bitmap); - - pos = pblk_rb_sync_init(&pblk->rwb, &flags); - if (pos == c_ctx->sentry) { - pos = pblk_end_w_bio(pblk, rqd, c_ctx); - -retry: - list_for_each_entry_safe(c, r, &pblk->compl_list, list) { - rqd = nvm_rq_from_c_ctx(c); - if (c->sentry == pos) { - pos = pblk_end_queued_w_bio(pblk, rqd, c); - goto retry; - } - } - } else { - WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); - list_add_tail(&c_ctx->list, &pblk->compl_list); - } - pblk_rb_sync_end(&pblk->rwb, &flags); -} - -/* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, - int rqd_ppas) -{ - struct pblk_line *line; - struct ppa_addr map_ppa = *ppa; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - __le64 *lba_list; - u64 paddr; - int done = 0; - int n = 0; - - line = pblk_ppa_to_line(pblk, *ppa); - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - - spin_lock(&line->lock); - - while (!done) { - paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); - - if (!test_and_set_bit(paddr, line->map_bitmap)) - line->left_msecs--; - - if (n < rqd_ppas && lba_list[paddr] != addr_empty) - line->nr_valid_lbas--; - - lba_list[paddr] = addr_empty; - - if (!test_and_set_bit(paddr, line->invalid_bitmap)) - le32_add_cpu(line->vsc, -1); - - done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); - - n++; - } - - line->w_err_gc->has_write_err = 1; - spin_unlock(&line->lock); -} - -static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, - unsigned int nr_entries) -{ - struct pblk_rb *rb = &pblk->rwb; - struct pblk_rb_entry *entry; - struct pblk_line *line; - struct pblk_w_ctx *w_ctx; - struct ppa_addr ppa_l2p; - int flags; - unsigned int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_entries; i++) { - entry = &rb->entries[pblk_rb_ptr_wrap(rb, sentry, i)]; - w_ctx = &entry->w_ctx; - - /* Check if the lba has been overwritten */ - if (w_ctx->lba != ADDR_EMPTY) { - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if 
(!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; - } - - /* Mark up the entry as submittable again */ - flags = READ_ONCE(w_ctx->flags); - flags |= PBLK_WRITTEN_DATA; - /* Release flags on write context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - - /* Decrease the reference count to the line as we will - * re-map these entries - */ - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&pblk->trans_lock); -} - -static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *r_ctx; - - r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); - if (!r_ctx) - return; - - r_ctx->lun_bitmap = NULL; - r_ctx->sentry = c_ctx->sentry; - r_ctx->nr_valid = c_ctx->nr_valid; - r_ctx->nr_padded = c_ctx->nr_padded; - - spin_lock(&pblk->resubmit_lock); - list_add_tail(&r_ctx->list, &pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); -#endif -} - -static void pblk_submit_rec(struct work_struct *work) -{ - struct pblk_rec_ctx *recovery = - container_of(work, struct pblk_rec_ctx, ws_rec); - struct pblk *pblk = recovery->pblk; - struct nvm_rq *rqd = recovery->rqd; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - pblk_log_write_err(pblk, rqd); - - pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); - pblk_queue_resubmit(pblk, c_ctx); - - pblk_up_rq(pblk, c_ctx->lun_bitmap); - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - mempool_free(recovery, &pblk->rec_pool); - - atomic_dec(&pblk->inflight_io); - pblk_write_kick(pblk); -} - - -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_rec_ctx *recovery; - - recovery = 
mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pblk_err(pblk, "could not allocate recovery work\n"); - return; - } - - recovery->pblk = pblk; - recovery->rqd = rqd; - - INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->close_wq, &recovery->ws_rec); -} - -static void pblk_end_io_write(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - - if (rqd->error) { - pblk_end_w_fail(pblk, rqd); - return; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif - } - - pblk_complete_write(pblk, rqd, c_ctx); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_write_meta(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); - struct pblk_line *line = m_ctx->private; - struct pblk_emeta *emeta = line->emeta; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int sync; - - pblk_up_chunk(pblk, ppa_list[0]); - - if (rqd->error) { - pblk_log_write_err(pblk, rqd); - pblk_err(pblk, "metadata I/O failed. 
Line %d\n", line->id); - line->w_err_gc->has_write_err = 1; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); - } - - sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); - if (sync == emeta->nr_entries) - pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, - GFP_ATOMIC, pblk->close_wq); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); -} - -static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs, nvm_end_io_fn(*end_io)) -{ - /* Setup write request */ - rqd->opcode = NVM_OP_PWRITE; - rqd->nr_ppas = nr_secs; - rqd->is_seq = 1; - rqd->private = pblk; - rqd->end_io = end_io; - - return pblk_alloc_rqd_meta(pblk, rqd); -} - -static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr *erase_ppa) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - unsigned int valid = c_ctx->nr_valid; - unsigned int padded = c_ctx->nr_padded; - unsigned int nr_secs = valid + padded; - unsigned long *lun_bitmap; - int ret; - - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) - return -ENOMEM; - c_ctx->lun_bitmap = lun_bitmap; - - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); - if (ret) { - kfree(lun_bitmap); - return ret; - } - - if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, 0); - else - ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, erase_ppa); - - return ret; -} - -static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, - unsigned int secs_to_flush) -{ - int secs_to_sync; - - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if ((!secs_to_sync && secs_to_flush) - || (secs_to_sync < 0) - || (secs_to_sync > secs_avail && !secs_to_flush)) { - 
pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", - secs_avail, secs_to_sync, secs_to_flush); - } -#endif - - return secs_to_sync; -} - -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = meta_line->emeta; - struct ppa_addr *ppa_list; - struct pblk_g_ctx *m_ctx; - struct nvm_rq *rqd; - void *data; - u64 paddr; - int rq_ppas = pblk->min_write_pgs; - int id = meta_line->id; - int rq_len; - int i, j; - int ret; - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - m_ctx = nvm_rq_to_pdu(rqd); - m_ctx->private = meta_line; - - rq_len = rq_ppas * geo->csecs; - data = ((void *)emeta->buf) + emeta->mem; - - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); - if (ret) - goto fail_free_rqd; - - ppa_list = nvm_rq_to_ppa_list(rqd); - for (i = 0; i < rqd->nr_ppas; ) { - spin_lock(&meta_line->lock); - paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); - spin_unlock(&meta_line->lock); - for (j = 0; j < rq_ppas; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); - } - - spin_lock(&l_mg->close_lock); - emeta->mem += rq_len; - if (emeta->mem >= lm->emeta_len[0]) - list_del(&meta_line->list); - spin_unlock(&l_mg->close_lock); - - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto fail_rollback; - } - - return NVM_IO_OK; - -fail_rollback: - pblk_up_chunk(pblk, ppa_list[0]); - spin_lock(&l_mg->close_lock); - pblk_dealloc_page(pblk, meta_line, rq_ppas); - list_add(&meta_line->list, &meta_line->list); - spin_unlock(&l_mg->close_lock); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - return ret; -} - -static inline bool pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct nvm_rq *data_rqd) 
-{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); - struct pblk_line *data_line = pblk_line_get_data(pblk); - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int pos_opt; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on metadata and data I/Os colliding. In - * this case, modify the distance to not be optimal, but move the - * optimal in the right direction. - */ - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - pos_opt = pblk_ppa_to_pos(geo, ppa_opt); - - if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || - test_bit(pos_opt, data_line->blk_bitmap)) - return true; - - if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) - data_line->meta_distance--; - - return false; -} - -static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, - struct nvm_rq *data_rqd) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *meta_line; - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (meta_line->emeta->mem >= lm->emeta_len[0]) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - spin_unlock(&l_mg->close_lock); - - if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) - return NULL; - - return meta_line; -} - -static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr erase_ppa; - struct pblk_line *meta_line; - int err; - - pblk_ppa_set_empty(&erase_ppa); - - /* Assign lbas to ppas and populate 
request structure */ - err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); - if (err) { - pblk_err(pblk, "could not setup write request: %d\n", err); - return NVM_IO_ERR; - } - - meta_line = pblk_should_submit_meta_io(pblk, rqd); - - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - pblk_err(pblk, "data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - - if (!pblk_ppa_empty(erase_ppa)) { - /* Submit erase for next data line */ - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = pblk_ppa_to_pos(geo, erase_ppa); - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - } - } - - if (meta_line) { - /* Submit metadata write for previous data line */ - err = pblk_submit_meta_io(pblk, meta_line); - if (err) { - pblk_err(pblk, "metadata I/O submission failed: %d", - err); - return NVM_IO_ERR; - } - } - - return NVM_IO_OK; -} - -static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, - c_ctx->nr_padded); -} - -static int pblk_submit_write(struct pblk *pblk, int *secs_left) -{ - struct bio *bio; - struct nvm_rq *rqd; - unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush, packed_meta_pgs; - unsigned long pos; - unsigned int resubmit; - - *secs_left = 0; - - spin_lock(&pblk->resubmit_lock); - resubmit = !list_empty(&pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - - /* Resubmit failed writes first */ - if (resubmit) { - struct pblk_c_ctx *r_ctx; - - spin_lock(&pblk->resubmit_lock); - r_ctx = list_first_entry(&pblk->resubmit_list, - struct pblk_c_ctx, list); - list_del(&r_ctx->list); - spin_unlock(&pblk->resubmit_lock); - - 
secs_avail = r_ctx->nr_valid; - pos = r_ctx->sentry; - - pblk_prepare_resubmit(pblk, pos, secs_avail); - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_avail); - - kfree(r_ctx); - } else { - /* If there are no sectors in the cache, - * flushes (bios without data) will be cleared on - * the cache threads - */ - secs_avail = pblk_rb_read_count(&pblk->rwb); - if (!secs_avail) - return 0; - - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) - return 0; - - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_to_flush); - if (secs_to_sync > pblk->max_write_pgs) { - pblk_err(pblk, "bad buffer sync calculation\n"); - return 0; - } - - secs_to_com = (secs_to_sync > secs_avail) ? - secs_avail : secs_to_sync; - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - } - - packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); - bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); - rqd->bio = bio; - - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, - secs_avail)) { - pblk_err(pblk, "corrupted write bio\n"); - goto fail_put_bio; - } - - if (pblk_submit_io_set(pblk, rqd)) - goto fail_free_bio; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(secs_to_sync, &pblk->sub_writes); -#endif - - *secs_left = 1; - return 0; - -fail_free_bio: - pblk_free_write_rqd(pblk, rqd); -fail_put_bio: - bio_put(bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return -EINTR; -} - -int pblk_write_ts(void *data) -{ - struct pblk *pblk = data; - int secs_left; - int write_failure = 0; - - while (!kthread_should_stop()) { - if (!write_failure) { - write_failure = pblk_submit_write(pblk, &secs_left); - - if (secs_left) - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} diff --git 
a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h deleted file mode 100644 index 86ffa875bfe1..000000000000 --- a/drivers/lightnvm/pblk.h +++ /dev/null @@ -1,1358 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) - * Copyright (C) 2016 CNEX Labs - * Initial release: Matias Bjorling - * Write buffering: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Physical Block-device target for Open-channel SSDs. - * - */ - -#ifndef PBLK_H_ -#define PBLK_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Run only GC if less than 1/X blocks are free */ -#define GC_LIMIT_INVERSE 5 -#define GC_TIME_MSECS 1000 - -#define PBLK_SECTOR (512) -#define PBLK_EXPOSED_PAGE_SIZE (4096) - -#define PBLK_NR_CLOSE_JOBS (4) - -#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) - -/* Max 512 LUNs per device */ -#define PBLK_MAX_LUNS_BITMAP (4) - -#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) - -/* Static pool sizes */ -#define PBLK_GEN_WS_POOL_SIZE (2) - -#define PBLK_DEFAULT_OP (11) - -enum { - PBLK_READ = READ, - PBLK_WRITE = WRITE,/* Write from write buffer */ - PBLK_WRITE_INT, /* Internal write - no write buffer */ - PBLK_READ_RECOV, /* Recovery read - errors allowed */ - PBLK_ERASE, -}; - -enum { - /* IO Types */ - PBLK_IOTYPE_USER = 1 << 0, - PBLK_IOTYPE_GC = 1 << 1, - - /* Write buffer flags */ - PBLK_FLUSH_ENTRY = 1 << 2, - PBLK_WRITTEN_DATA = 1 << 3, - PBLK_SUBMITTED_ENTRY = 1 << 4, - PBLK_WRITABLE_ENTRY = 1 << 5, -}; - -enum 
{ - PBLK_BLK_ST_OPEN = 0x1, - PBLK_BLK_ST_CLOSED = 0x2, -}; - -enum { - PBLK_CHUNK_RESET_START, - PBLK_CHUNK_RESET_DONE, - PBLK_CHUNK_RESET_FAILED, -}; - -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - -/* The number of GC lists and the rate-limiter states go together. This way the - * rate-limiter can dictate how much GC is needed based on resource utilization. - */ -#define PBLK_GC_NR_LISTS 4 - -enum { - PBLK_RL_OFF = 0, - PBLK_RL_WERR = 1, - PBLK_RL_HIGH = 2, - PBLK_RL_MID = 3, - PBLK_RL_LOW = 4 -}; - -#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) - -/* write buffer completion context */ -struct pblk_c_ctx { - struct list_head list; /* Head for out-of-order completion */ - - unsigned long *lun_bitmap; /* Luns used on current request */ - unsigned int sentry; - unsigned int nr_valid; - unsigned int nr_padded; -}; - -/* read context */ -struct pblk_g_ctx { - void *private; - unsigned long start_time; - u64 lba; -}; - -/* Pad context */ -struct pblk_pad_rq { - struct pblk *pblk; - struct completion wait; - struct kref ref; -}; - -/* Recovery context */ -struct pblk_rec_ctx { - struct pblk *pblk; - struct nvm_rq *rqd; - struct work_struct ws_rec; -}; - -/* Write context */ -struct pblk_w_ctx { - struct bio_list bios; /* Original bios - used for completion - * in REQ_FUA, REQ_FLUSH case - */ - u64 lba; /* Logic addr. associated with entry */ - struct ppa_addr ppa; /* Physic addr. 
associated with entry */ - int flags; /* Write context flags */ -}; - -struct pblk_rb_entry { - struct ppa_addr cacheline; /* Cacheline for this entry */ - void *data; /* Pointer to data on this entry */ - struct pblk_w_ctx w_ctx; /* Context for this entry */ - struct list_head index; /* List head to enable indexes */ -}; - -#define EMPTY_ENTRY (~0U) - -struct pblk_rb_pages { - struct page *pages; - int order; - struct list_head list; -}; - -struct pblk_rb { - struct pblk_rb_entry *entries; /* Ring buffer entries */ - unsigned int mem; /* Write offset - points to next - * writable entry in memory - */ - unsigned int subm; /* Read offset - points to last entry - * that has been submitted to the media - * to be persisted - */ - unsigned int sync; /* Synced - backpointer that signals - * the last submitted entry that has - * been successfully persisted to media - */ - unsigned int flush_point; /* Sync point - last entry that must be - * flushed to the media. Used with - * REQ_FLUSH and REQ_FUA - */ - unsigned int l2p_update; /* l2p update point - next entry for - * which l2p mapping will be updated to - * contain a device ppa address (instead - * of a cacheline - */ - unsigned int nr_entries; /* Number of entries in write buffer - - * must be a power of two - */ - unsigned int seg_size; /* Size of the data segments being - * stored on each entry. 
Typically this - * will be 4KB - */ - - unsigned int back_thres; /* Threshold that shall be maintained by - * the backpointer in order to respect - * geo->mw_cunits on a per chunk basis - */ - - struct list_head pages; /* List of data pages */ - - spinlock_t w_lock; /* Write lock */ - spinlock_t s_lock; /* Sync lock */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ -#endif -}; - -#define PBLK_RECOVERY_SECTORS 16 - -struct pblk_lun { - struct ppa_addr bppa; - struct semaphore wr_sem; -}; - -struct pblk_gc_rq { - struct pblk_line *line; - void *data; - u64 paddr_list[NVM_MAX_VLBA]; - u64 lba_list[NVM_MAX_VLBA]; - int nr_secs; - int secs_to_gc; - struct list_head list; -}; - -struct pblk_gc { - /* These states are not protected by a lock since (i) they are in the - * fast path, and (ii) they are not critical. - */ - int gc_active; - int gc_enabled; - int gc_forced; - - struct task_struct *gc_ts; - struct task_struct *gc_writer_ts; - struct task_struct *gc_reader_ts; - - struct workqueue_struct *gc_line_reader_wq; - struct workqueue_struct *gc_reader_wq; - - struct timer_list gc_timer; - - struct semaphore gc_sem; - atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ - atomic_t pipeline_gc; /* Number of lines in the GC pipeline - - * started reads to finished writes - */ - int w_entries; - - struct list_head w_list; - struct list_head r_list; - - spinlock_t lock; - spinlock_t w_lock; - spinlock_t r_lock; -}; - -struct pblk_rl { - unsigned int high; /* Upper threshold for rate limiter (free run - - * user I/O rate limiter - */ - unsigned int high_pw; /* High rounded up as a power of 2 */ - -#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ -#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ - - int rb_windows_pw; /* Number of rate windows in the write buffer - * given as a power-of-2. 
This guarantees that - * when user I/O is being rate limited, there - * will be reserved enough space for the GC to - * place its payload. A window is of - * pblk->max_write_pgs size, which in NVMe is - * 64, i.e., 256kb. - */ - int rb_budget; /* Total number of entries available for I/O */ - int rb_user_max; /* Max buffer entries available for user I/O */ - int rb_gc_max; /* Max buffer entries available for GC I/O */ - int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ - int rb_state; /* Rate-limiter current state */ - int rb_max_io; /* Maximum size for an I/O giving the config */ - - atomic_t rb_user_cnt; /* User I/O buffer counter */ - atomic_t rb_gc_cnt; /* GC I/O buffer counter */ - atomic_t rb_space; /* Space limit in case of reaching capacity */ - - int rsv_blocks; /* Reserved blocks for GC */ - - int rb_user_active; - int rb_gc_active; - - atomic_t werr_lines; /* Number of write error lines that needs gc */ - - struct timer_list u_timer; - - unsigned long total_blocks; - - atomic_t free_blocks; /* Total number of free blocks (+ OP) */ - atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ -}; - -#define PBLK_LINE_EMPTY (~0U) - -enum { - /* Line Types */ - PBLK_LINETYPE_FREE = 0, - PBLK_LINETYPE_LOG = 1, - PBLK_LINETYPE_DATA = 2, - - /* Line state */ - PBLK_LINESTATE_NEW = 9, - PBLK_LINESTATE_FREE = 10, - PBLK_LINESTATE_OPEN = 11, - PBLK_LINESTATE_CLOSED = 12, - PBLK_LINESTATE_GC = 13, - PBLK_LINESTATE_BAD = 14, - PBLK_LINESTATE_CORRUPT = 15, - - /* GC group */ - PBLK_LINEGC_NONE = 20, - PBLK_LINEGC_EMPTY = 21, - PBLK_LINEGC_LOW = 22, - PBLK_LINEGC_MID = 23, - PBLK_LINEGC_HIGH = 24, - PBLK_LINEGC_FULL = 25, - PBLK_LINEGC_WERR = 26 -}; - -#define PBLK_MAGIC 0x70626c6b /*pblk*/ - -/* emeta/smeta persistent storage format versions: - * Changes in major version requires offline migration. - * Changes in minor version are handled automatically during - * recovery. 
- */ - -#define SMETA_VERSION_MAJOR (0) -#define SMETA_VERSION_MINOR (1) - -#define EMETA_VERSION_MAJOR (0) -#define EMETA_VERSION_MINOR (2) - -struct line_header { - __le32 crc; - __le32 identifier; /* pblk identifier */ - __u8 uuid[16]; /* instance uuid */ - __le16 type; /* line type */ - __u8 version_major; /* version major */ - __u8 version_minor; /* version minor */ - __le32 id; /* line id for current line */ -}; - -struct line_smeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - /* Previous line metadata */ - __le32 prev_id; /* Line id for previous line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - __le32 rsvd[2]; - - __le64 lun_bitmap[]; -}; - - -/* - * Metadata layout in media: - * First sector: - * 1. struct line_emeta - * 2. bad block bitmap (u64 * window_wr_lun) - * 3. write amplification counters - * Mid sectors (start at lbas_sector): - * 3. nr_lbas (u64) forming lba list - * Last sectors (start at vsc_sector): - * 4. 
u32 valid sector count (vsc) for all lines (~0U: free line) - */ -struct line_emeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - - /* Previous line metadata */ - __le32 prev_id; /* Line id for prev line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - /* Bookkeeping for recovery */ - __le32 next_id; /* Line id for next line */ - __le64 nr_lbas; /* Number of lbas mapped in line */ - __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ - __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ -}; - - -/* Write amplification counters stored on media */ -struct wa_counters { - __le64 user; /* Number of user written sectors */ - __le64 gc; /* Number of sectors written by GC*/ - __le64 pad; /* Number of padded sectors */ -}; - -struct pblk_emeta { - struct line_emeta *buf; /* emeta buffer in media format */ - int mem; /* Write offset - points to next - * writable entry in memory - */ - atomic_t sync; /* Synced - backpointer that signals the - * last entry that has been successfully - * persisted to media - */ - unsigned int nr_entries; /* Number of emeta entries */ -}; - -struct pblk_smeta { - struct line_smeta *buf; /* smeta buffer in persistent format */ -}; - -struct pblk_w_err_gc { - int has_write_err; - int has_gc_err; - __le64 *lba_list; -}; - -struct pblk_line { - struct pblk *pblk; - unsigned int id; /* Line number corresponds to the - * block line - */ - unsigned int seq_nr; /* Unique line sequence number */ - - int state; /* PBLK_LINESTATE_X */ - int type; /* PBLK_LINETYPE_X */ - int gc_group; /* PBLK_LINEGC_X */ - struct list_head list; /* Free, GC lists */ - - unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - - struct nvm_chk_meta *chks; /* Chunks forming line */ - - struct pblk_smeta *smeta; /* Start metadata */ - struct pblk_emeta *emeta; /* End 
medatada */ - - int meta_line; /* Metadata line id */ - int meta_distance; /* Distance between data and metadata */ - - u64 emeta_ssec; /* Sector where emeta starts */ - - unsigned int sec_in_line; /* Number of usable secs in line */ - - atomic_t blk_in_line; /* Number of good blocks in line */ - unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ - unsigned long *erase_bitmap; /* Bitmap for erased blocks */ - - unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ - unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ - - atomic_t left_eblks; /* Blocks left for erasing */ - atomic_t left_seblks; /* Blocks left for sync erasing */ - - int left_msecs; /* Sectors left for mapping */ - unsigned int cur_sec; /* Sector map pointer */ - unsigned int nr_valid_lbas; /* Number of valid lbas in line */ - - __le32 *vsc; /* Valid sector count in line */ - - struct kref ref; /* Write buffer L2P references */ - atomic_t sec_to_update; /* Outstanding L2P updates to ppa */ - - struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ - - spinlock_t lock; /* Necessary for invalid_bitmap only */ -}; - -#define PBLK_DATA_LINES 4 - -enum { - PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ - PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ - PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ -}; - -struct pblk_line_mgmt { - int nr_lines; /* Total number of full lines */ - int nr_free_lines; /* Number of full lines in free list */ - - /* Free lists - use free_lock */ - struct list_head free_list; /* Full lines ready to use */ - struct list_head corrupt_list; /* Full lines corrupted */ - struct list_head bad_list; /* Full lines bad */ - - /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_GC_NR_LISTS]; - struct list_head gc_high_list; /* Full lines ready to GC, high isc */ - struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ - struct list_head gc_low_list; /* Full lines 
ready to GC, low isc */ - - struct list_head gc_werr_list; /* Write err recovery list */ - - struct list_head gc_full_list; /* Full lines ready to GC, no valid */ - struct list_head gc_empty_list; /* Full lines close, all valid */ - - struct pblk_line *log_line; /* Current FTL log line */ - struct pblk_line *data_line; /* Current data line */ - struct pblk_line *log_next; /* Next FTL log line */ - struct pblk_line *data_next; /* Next data line */ - - struct list_head emeta_list; /* Lines queued to schedule emeta */ - - __le32 *vsc_list; /* Valid sector counts for all lines */ - - /* Pre-allocated metadata for data lines */ - struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; - struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; - unsigned long meta_bitmap; - - /* Cache and mempool for map/invalid bitmaps */ - struct kmem_cache *bitmap_cache; - mempool_t *bitmap_pool; - - /* Helpers for fast bitmap calculations */ - unsigned long *bb_template; - unsigned long *bb_aux; - - unsigned long d_seq_nr; /* Data line unique sequence number */ - unsigned long l_seq_nr; /* Log line unique sequence number */ - - spinlock_t free_lock; - spinlock_t close_lock; - spinlock_t gc_lock; -}; - -struct pblk_line_meta { - unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta */ - - unsigned int emeta_len[4]; /* Lengths for emeta: - * [0]: Total - * [1]: struct line_emeta + - * bb_bitmap + struct wa_counters - * [2]: L2P portion - * [3]: vsc - */ - unsigned int emeta_sec[4]; /* Sectors needed for emeta. 
Same layout - * as emeta_len - */ - - unsigned int emeta_bb; /* Boundary for bb that affects emeta */ - - unsigned int vsc_list_len; /* Length for vsc list */ - unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ - unsigned int blk_bitmap_len; /* Length for block bitmap in line */ - unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ - - unsigned int blk_per_line; /* Number of blocks in a full line */ - unsigned int sec_per_line; /* Number of sectors in a line */ - unsigned int dsec_per_line; /* Number of data sectors in a line */ - unsigned int min_blk_line; /* Min. number of good blocks in line */ - - unsigned int mid_thrs; /* Threshold for GC mid list */ - unsigned int high_thrs; /* Threshold for GC high list */ - - unsigned int meta_distance; /* Distance between data and metadata */ -}; - -enum { - PBLK_STATE_RUNNING = 0, - PBLK_STATE_STOPPING = 1, - PBLK_STATE_RECOVERING = 2, - PBLK_STATE_STOPPED = 3, -}; - -/* Internal format to support not power-of-2 device formats */ -struct pblk_addrf { - /* gen to dev */ - int sec_stripe; - int ch_stripe; - int lun_stripe; - - /* dev to gen */ - int sec_lun_stripe; - int sec_ws_stripe; -}; - -struct pblk { - struct nvm_tgt_dev *dev; - struct gendisk *disk; - - struct kobject kobj; - - struct pblk_lun *luns; - - struct pblk_line *lines; /* Line array */ - struct pblk_line_mgmt l_mg; /* Line management */ - struct pblk_line_meta lm; /* Line metadata */ - - struct nvm_addrf addrf; /* Aligned address format */ - struct pblk_addrf uaddrf; /* Unaligned address format */ - int addrf_len; - - struct pblk_rb rwb; - - int state; /* pblk line state */ - - int min_write_pgs; /* Minimum amount of pages required by controller */ - int min_write_pgs_data; /* Minimum amount of payload pages */ - int max_write_pgs; /* Maximum amount of pages supported by controller */ - int oob_meta_size; /* Size of OOB sector metadata */ - - sector_t capacity; /* Device capacity when bad blocks are subtracted */ - - int op; 
/* Percentage of device used for over-provisioning */ - int op_blks; /* Number of blocks used for over-provisioning */ - - /* pblk provisioning values. Used by rate limiter */ - struct pblk_rl rl; - - int sec_per_write; - - guid_t instance_uuid; - - /* Persistent write amplification counters, 4kb sector I/Os */ - atomic64_t user_wa; /* Sectors written by user */ - atomic64_t gc_wa; /* Sectors written by GC */ - atomic64_t pad_wa; /* Padded sectors written */ - - /* Reset values for delta write amplification measurements */ - u64 user_rst_wa; - u64 gc_rst_wa; - u64 pad_rst_wa; - - /* Counters used for calculating padding distribution */ - atomic64_t *pad_dist; /* Padding distribution buckets */ - u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ - atomic64_t nr_flush; /* Number of flush/fua I/O */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Non-persistent debug counters, 4kb sector I/Os */ - atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ - atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ - atomic_long_t padded_wb; /* Sectors padded in write buffer */ - atomic_long_t req_writes; /* Sectors stored on write buffer */ - atomic_long_t sub_writes; /* Sectors submitted from buffer */ - atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t inflight_reads; /* Inflight sector read requests */ - atomic_long_t cache_reads; /* Read requests that hit the cache */ - atomic_long_t sync_reads; /* Completed sector read requests */ - atomic_long_t recov_writes; /* Sectors submitted from recovery */ - atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ - atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ -#endif - - spinlock_t lock; - - atomic_long_t read_failed; - atomic_long_t read_empty; - atomic_long_t read_high_ecc; - atomic_long_t read_failed_gc; - atomic_long_t write_failed; - atomic_long_t erase_failed; - - atomic_t inflight_io; /* General inflight I/O counter */ - - struct task_struct 
*writer_ts; - - /* Simple translation map of logical addresses to physical addresses. - * The logical addresses is known by the host system, while the physical - * addresses are used when writing to the disk block device. - */ - unsigned char *trans_map; - spinlock_t trans_lock; - - struct list_head compl_list; - - spinlock_t resubmit_lock; /* Resubmit list lock */ - struct list_head resubmit_list; /* Resubmit list for failed writes*/ - - mempool_t page_bio_pool; - mempool_t gen_ws_pool; - mempool_t rec_pool; - mempool_t r_rq_pool; - mempool_t w_rq_pool; - mempool_t e_rq_pool; - - struct workqueue_struct *close_wq; - struct workqueue_struct *bb_wq; - struct workqueue_struct *r_end_wq; - - struct timer_list wtimer; - - struct pblk_gc gc; -}; - -struct pblk_line_ws { - struct pblk *pblk; - struct pblk_line *line; - void *priv; - struct work_struct ws; -}; - -#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) -#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) - -#define pblk_err(pblk, fmt, ...) \ - pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_info(pblk, fmt, ...) \ - pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_warn(pblk, fmt, ...) \ - pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_debug(pblk, fmt, ...) 
\ - pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) - -/* - * pblk ring buffer operations - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_sz); -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos); -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos); -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int pos); -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int pos); -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); -void pblk_rb_flush(struct pblk_rb *rb); - -void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count); -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa); -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries); -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); - -unsigned int pblk_rb_read_count(struct pblk_rb *rb); -unsigned int pblk_rb_sync_count(struct pblk_rb *rb); -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); - -int pblk_rb_tear_down_check(struct pblk_rb *rb); -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); -void pblk_rb_free(struct pblk_rb *rb); -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); - -/* - * pblk core - */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); 
-void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); -int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx); -void pblk_discard(struct pblk *pblk, struct bio *bio); -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk); -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *lp, - struct ppa_addr ppa); -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct pblk_line *pblk_line_get(struct pblk *pblk); -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa); -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); -struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_erase(struct pblk *pblk); -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); -int pblk_line_is_full(struct pblk_line *line); -void pblk_line_free(struct pblk_line *line); -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); -void pblk_pipeline_stop(struct pblk *pblk); -void 
__pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_flush(struct pblk *pblk); -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq); -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line); -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf); -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); -void pblk_line_put(struct kref *ref); -void pblk_line_put_wq(struct kref *ref); -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta); -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap); -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap); -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages); -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages); -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); -void pblk_update_map_cache(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa); -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa, struct 
ppa_addr entry_line); -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line, u64 paddr); -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs); -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache); -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); - -/* - * pblk user I/O write path - */ -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); - -/* - * pblk map - */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa); -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off); - -/* - * pblk write thread - */ -int pblk_write_ts(void *data); -void pblk_write_timer_fn(struct timer_list *t); -void pblk_write_should_kick(struct pblk *pblk); -void pblk_write_kick(struct pblk *pblk); - -/* - * pblk read path - */ -extern struct bio_set pblk_bio_set; -void pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); -/* - * pblk recovery - */ -struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -int pblk_recov_pad(struct pblk *pblk); -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); - -/* - * pblk gc - */ -#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ -#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ - -int pblk_gc_init(struct pblk *pblk); -void pblk_gc_exit(struct pblk *pblk, bool graceful); -void 
pblk_gc_should_start(struct pblk *pblk); -void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_free_full_lines(struct pblk *pblk); -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active); -int pblk_gc_sysfs_force(struct pblk *pblk, int force); -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line); - -/* - * pblk rate limiter - */ -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold); -void pblk_rl_free(struct pblk_rl *rl); -void pblk_rl_update_rates(struct pblk_rl *rl); -int pblk_rl_high_thrs(struct pblk_rl *rl); -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_max_io(struct pblk_rl *rl); -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used); -int pblk_rl_is_limit(struct pblk_rl *rl); - -void pblk_rl_werr_line_in(struct pblk_rl *rl); -void pblk_rl_werr_line_out(struct pblk_rl *rl); - -/* - * pblk sysfs - */ -int pblk_sysfs_init(struct gendisk *tdisk); -void pblk_sysfs_exit(struct gendisk *tdisk); - -static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) -{ - return c_ctx - sizeof(struct nvm_rq); -} - -static inline void *emeta_to_bb(struct line_emeta *emeta) -{ - return emeta->bb_bitmap; -} - -static inline void *emeta_to_wa(struct pblk_line_meta *lm, - struct line_emeta *emeta) -{ - return emeta->bb_bitmap + lm->blk_bitmap_len; -} - -static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta 
*emeta) -{ - return ((void *)emeta + pblk->lm.emeta_len[1]); -} - -static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) -{ - return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); -} - -static inline int pblk_line_vsc(struct pblk_line *line) -{ - return le32_to_cpu(*line->vsc); -} - -static inline int pblk_ppa_to_line_id(struct ppa_addr p) -{ - return p.a.blk; -} - -static inline struct pblk_line *pblk_ppa_to_line(struct pblk *pblk, - struct ppa_addr p) -{ - return &pblk->lines[pblk_ppa_to_line_id(p)]; -} - -static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) -{ - return p.a.lun * geo->num_ch + p.a.ch; -} - -static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - ppa.ppa = 0; - ppa.g.blk = line_id; - ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset; - ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset; - ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset; - ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset; - ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - int secs, chnls, luns; - - ppa.ppa = 0; - - ppa.m.chk = line_id; - - paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs); - ppa.m.sec = secs; - - paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls); - ppa.m.grp = chnls; - - paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns); - ppa.m.pu = luns; - - ppa.m.sec += uaddrf->sec_stripe * paddr; - } - - return ppa; -} - -static inline struct nvm_chk_meta *pblk_dev_ppa_to_chunk(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *line = pblk_ppa_to_line(pblk, p); - int pos = pblk_ppa_to_pos(geo, 
p); - - return &line->chks[pos]; -} - -static inline u64 pblk_dev_ppa_to_chunk_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return dev_to_chunk_addr(dev->parent, &pblk->addrf, p); -} - -static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - u64 paddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - paddr = (u64)p.g.ch << ppaf->ch_offset; - paddr |= (u64)p.g.lun << ppaf->lun_offset; - paddr |= (u64)p.g.pg << ppaf->pg_offset; - paddr |= (u64)p.g.pl << ppaf->pln_offset; - paddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - u64 secs = p.m.sec; - int sec_stripe; - - paddr = (u64)p.m.grp * uaddrf->sec_stripe; - paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe; - - secs = div_u64_rem(secs, uaddrf->sec_stripe, &sec_stripe); - paddr += secs * uaddrf->sec_ws_stripe; - paddr += sec_stripe; - } - - return paddr; -} - -static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa32_to_ppa64(dev->parent, &pblk->addrf, ppa32); -} - -static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa64_to_ppa32(dev->parent, &pblk->addrf, ppa64); -} - -static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, - sector_t lba) -{ - struct ppa_addr ppa; - - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); - } else { - struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - - ppa = map[lba]; - } - - return ppa; -} - -static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa) -{ - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - map[lba] = 
pblk_ppa64_to_ppa32(pblk, ppa); - } else { - u64 *map = (u64 *)pblk->trans_map; - - map[lba] = ppa.ppa; - } -} - -static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) -{ - return (lppa.ppa == rppa.ppa); -} - -static inline int pblk_addr_in_cache(struct ppa_addr ppa) -{ - return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); -} - -static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) -{ - return ppa.c.line; -} - -static inline struct ppa_addr pblk_cacheline_to_addr(int addr) -{ - struct ppa_addr p; - - p.c.line = addr; - p.c.is_cached = 1; - - return p; -} - -static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_header *header) -{ - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, - struct line_smeta *smeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)smeta + - sizeof(struct line_header) + sizeof(crc), - lm->smeta_len - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)emeta + - sizeof(struct line_header) + sizeof(crc), - lm->emeta_len[0] - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) -{ - return !(nr_secs % pblk->min_write_pgs); -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, - char *msg, int error) -{ - struct nvm_geo *geo = &pblk->dev->geo; - - 
if (p->c.is_cached) { - pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", - msg, error, (u64)p->c.line); - } else if (geo->version == NVM_OCSSD_SPEC_12) { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", - msg, error, - p->g.ch, p->g.lun, p->g.blk, - p->g.pg, p->g.pl, p->g.sec); - } else { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", - msg, error, - p->m.grp, p->m.pu, p->m.chk, p->m.sec); - } -} - -static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, - int error) -{ - int bit = -1; - - if (rqd->nr_ppas == 1) { - print_ppa(pblk, &rqd->ppa_addr, "rqd", error); - return; - } - - while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, - bit + 1)) < rqd->nr_ppas) { - print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); - } - - pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); -} - -static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct ppa_addr *ppa; - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa = &ppas[i]; - - if (geo->version == NVM_OCSSD_SPEC_12) { - if (!ppa->c.is_cached && - ppa->g.ch < geo->num_ch && - ppa->g.lun < geo->num_lun && - ppa->g.pl < geo->num_pln && - ppa->g.blk < geo->num_chk && - ppa->g.pg < geo->num_pg && - ppa->g.sec < geo->ws_min) - continue; - } else { - if (!ppa->c.is_cached && - ppa->m.grp < geo->num_ch && - ppa->m.pu < geo->num_lun && - ppa->m.chk < geo->num_chk && - ppa->m.sec < geo->clba) - continue; - } - - print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); - - return 1; - } - return 0; -} - -static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } - - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - int 
i; - - for (i = 0; i < rqd->nr_ppas; i++) { - line = pblk_ppa_to_line(pblk, ppa_list[i]); - - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pblk_err(pblk, "bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } - - return 0; -} -#endif - -static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - - if (paddr > lm->sec_per_line) - return 1; - - return 0; -} - -static inline unsigned int pblk_get_bi_idx(struct bio *bio) -{ - return bio->bi_iter.bi_idx; -} - -static inline sector_t pblk_get_lba(struct bio *bio) -{ - return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; -} - -static inline unsigned int pblk_get_secs(struct bio *bio) -{ - return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; -} - -static inline char *pblk_disk_name(struct pblk *pblk) -{ - struct gendisk *disk = pblk->disk; - - return disk->disk_name; -} - -static inline unsigned int pblk_get_min_chks(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - /* In a worst-case scenario every line will have OP invalid sectors. 
- * We will then need a minimum of 1/OP lines to free up a single line - */ - - return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; -} - -static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, - void *meta, int index) -{ - return meta + - max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * index; -} - -static inline int pblk_dma_meta_size(struct pblk *pblk) -{ - return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * NVM_MAX_VLBA; -} - -static inline int pblk_is_oob_meta_supported(struct pblk *pblk) -{ - return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta); -} -#endif /* PBLK_H_ */ diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index dae460699a8f..0ced36aee890 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" + select BLOCK_HOLDER_DEPRECATED if SYSFS help This driver lets you combine several hard disk partitions into one logical block device. This can be used to simply append one @@ -201,6 +202,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" + select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN depends on DAX || DAX=n help @@ -361,7 +363,7 @@ config DM_WRITECACHE config DM_EBS tristate "Emulated block size target (EXPERIMENTAL)" - depends on BLK_DEV_DM + depends on BLK_DEV_DM && !HIGHMEM select DM_BUFIO help dm-ebs emulates smaller logical block size on backing devices diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d1ca4d059c20..cf3e8096942a 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 help Allows a block device to be used as cache for other devices; uses diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 183a58c89377..0595559de174 100644 --- a/drivers/md/bcache/btree.c +++ 
b/drivers/md/bcache/btree.c @@ -378,7 +378,7 @@ static void do_btree_node_write(struct btree *b) struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, b->bio, iter_all) { - memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + memcpy(bvec_virt(bv), addr, PAGE_SIZE); addr += PAGE_SIZE; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 185246a0d855..f2874c77ff79 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -885,11 +885,6 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - bool disk_added = (disk->flags & GENHD_FL_UP) != 0; - - if (disk_added) - del_gendisk(disk); - blk_cleanup_disk(disk); ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); @@ -931,20 +926,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -987,8 +982,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } @@ -1365,8 +1366,10 @@ static void cached_dev_free(struct closure *cl) 
mutex_lock(&bch_register_lock); - if (atomic_read(&dc->running)) + if (atomic_read(&dc->running)) { bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + del_gendisk(dc->disk.disk); + } bcache_device_free(&dc->disk); list_del(&dc->list); @@ -1512,6 +1515,7 @@ static void flash_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), &d->c->flash_dev_dirty_sectors); + del_gendisk(d->disk); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index bca4a7c97da7..b64460a76267 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ #include "closure.h" -#define PAGE_SECTORS (PAGE_SIZE / 512) - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 71475a2410be..0c509dae0ff8 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -74,7 +74,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv if (unlikely(!bv->bv_page || !bv_len)) return -EIO; - pa = page_address(bv->bv_page) + bv->bv_offset; + pa = bvec_virt(bv); /* Handle overlapping page <-> blocks */ while (bv_len) { diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 20f2510db1f6..a9ea361769a7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1819,7 +1819,7 @@ again: unsigned this_len; BUG_ON(PageHighMem(biv.bv_page)); - tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag = bvec_virt(&biv); this_len = min(biv.bv_len, data_to_process); r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, this_len, dio->op == REQ_OP_READ ? 
TAG_READ : TAG_WRITE); @@ -2006,7 +2006,7 @@ retry_kmap: unsigned tag_now = min(biv.bv_len, tag_todo); char *tag_addr; BUG_ON(PageHighMem(biv.bv_page)); - tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag_addr = bvec_virt(&biv); if (likely(dio->op == REQ_OP_WRITE)) memcpy(tag_ptr, tag_addr, tag_now); else diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2209cbcd84db..2575074a2204 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1436,9 +1436,6 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si } if (dm_get_md_type(md) == DM_TYPE_NONE) { - /* Initial table load: acquire type of table. */ - dm_set_md_type(md, dm_table_get_type(t)); - /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { @@ -2187,7 +2184,6 @@ int __init dm_early_create(struct dm_ioctl *dmi, if (r) goto err_destroy_table; - md->type = dm_table_get_type(t); /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 0dbd48cbdff9..5b95eea517d1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -559,7 +559,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) err = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (err) goto out_tag_set; - elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 27d36f26fb7b..870abdef6dbd 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2153,7 +2153,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_keyslot_manager(q, t); - blk_queue_update_readahead(q); + disk_update_readahead(t->md->disk); return 0; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e21e29e81bbf..3d2cf811ec3e 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1214,14 +1214,13 @@ 
static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; - unsigned long flags; unsigned size; int rw = bio_data_dir(bio); unsigned remaining_size = wc->block_size; do { struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); - buf = bvec_kmap_irq(&bv, &flags); + buf = bvec_kmap_local(&bv); size = bv.bv_len; if (unlikely(size > remaining_size)) size = remaining_size; @@ -1239,7 +1238,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data memcpy_flushcache_optimized(data, buf, size); } - bvec_kunmap_irq(buf, &flags); + kunmap_local(buf); data = (char *)data + size; remaining_size -= size; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2c5f9e585211..7981b7287628 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1693,14 +1693,13 @@ static void cleanup_mapped_device(struct mapped_device *md) spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - del_gendisk(md->disk); - } - - if (md->queue) + if (dm_get_md_type(md) != DM_TYPE_NONE) { + dm_sysfs_exit(md); + del_gendisk(md->disk); + } dm_queue_destroy_keyslot_manager(md->queue); - - if (md->disk) blk_cleanup_disk(md->disk); + } cleanup_srcu_struct(&md->io_barrier); @@ -1792,7 +1791,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; } - add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -1993,19 +1991,12 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { - int r; struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; - r = dm_sysfs_init(md); - if (r) { - free_dev(md); - return r; - } - *result = md; return 0; } @@ -2056,9 +2047,9 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); */ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { - int 
r; + enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; - enum dm_queue_mode type = dm_get_md_type(md); + int r; switch (type) { case DM_TYPE_REQUEST_BASED: @@ -2086,8 +2077,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (r) return r; - blk_register_queue(md->disk); + add_disk(md->disk); + r = dm_sysfs_init(md); + if (r) { + del_gendisk(md->disk); + return r; + } + md->type = type; return 0; } @@ -2193,7 +2190,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", dm_device_name(md), atomic_read(&md->holders)); - dm_sysfs_exit(md); dm_table_destroy(__unbind(md)); free_dev(md); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 832547cf038f..4c96c36bd01a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -764,9 +764,7 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) { - int flags = rdev->bdev->bd_disk->flags; - - if (!(flags & GENHD_FL_UP)) { + if (!disk_live(rdev->bdev->bd_disk)) { if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) pr_warn("md: %s: %s array has a missing/failed member\n", mdname(rdev->mddev), md_type); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3c44c4bb40fc..19598bd38939 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1329,6 +1329,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + bool write_behind = false; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1381,6 +1382,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + + /* + * The write-behind io is only attempted on drives marked as + * write-mostly, 
which means we could allocate write behind + * bio later. + */ + if (rdev && test_bit(WriteMostly, &rdev->flags)) + write_behind = true; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1454,6 +1464,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, goto retry_write; } + /* + * When using a bitmap, we may call alloc_behind_master_bio below. + * alloc_behind_master_bio allocates a copy of the data payload a page + * at a time and thus needs a new bio that can fit the whole payload + * this bio in page sized chunks. + */ + if (write_behind && bitmap) + max_sectors = min_t(int, max_sectors, + BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 07119d7e0fdf..aa2636582841 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1712,6 +1712,11 @@ retry_discard: } else r10_bio->master_bio = (struct bio *)first_r10bio; + /* + * first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1743,9 +1748,6 @@ retry_discard: for (disk = 0; disk < geo->raid_disks; disk++) { sector_t dev_start, dev_end; struct bio *mbio, *rbio = NULL; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); /* * Now start to calculate the start and end address for each disk. @@ -1775,9 +1777,12 @@ retry_discard: /* * It only handles discard bio which size is >= stripe size, so - * dev_end > dev_start all the time + * dev_end > dev_start all the time. + * It doesn't need to use rcu lock to get rdev here. We already + * add rdev->nr_pending in the first loop. 
*/ if (r10_bio->devs[disk].bio) { + struct md_rdev *rdev = conf->mirrors[disk].rdev; mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); mbio->bi_end_io = raid10_end_discard_request; mbio->bi_private = r10_bio; @@ -1790,6 +1795,7 @@ retry_discard: bio_endio(mbio); } if (r10_bio->devs[disk].repl_bio) { + struct md_rdev *rrdev = conf->mirrors[disk].replacement; rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); rbio->bi_end_io = raid10_end_discard_request; rbio->bi_private = r10_bio; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index ce8aed562929..6a15fdf6e5f2 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -128,8 +128,6 @@ struct mmc_blk_data { * track of the current selected device partition. */ unsigned int part_curr; - struct device_attribute force_ro; - struct device_attribute power_ro_lock; int area_type; /* debugfs files (only in main mmc_blk_data) */ @@ -281,6 +279,9 @@ out_put: return count; } +static DEVICE_ATTR(ro_lock_until_next_power_on, 0, + power_ro_lock_show, power_ro_lock_store); + static ssize_t force_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -313,6 +314,44 @@ out: return ret; } +static DEVICE_ATTR(force_ro, 0644, force_ro_show, force_ro_store); + +static struct attribute *mmc_disk_attrs[] = { + &dev_attr_force_ro.attr, + &dev_attr_ro_lock_until_next_power_on.attr, + NULL, +}; + +static umode_t mmc_disk_attrs_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + umode_t mode = a->mode; + + if (a == &dev_attr_ro_lock_until_next_power_on.attr && + (md->area_type & MMC_BLK_DATA_AREA_BOOT) && + md->queue.card->ext_csd.boot_ro_lockable) { + mode = S_IRUGO; + if (!(md->queue.card->ext_csd.boot_ro_lock & + EXT_CSD_BOOT_WP_B_PWR_WP_DIS)) + mode |= S_IWUSR; + } + + mmc_blk_put(md); + return mode; +} + +static const struct attribute_group 
mmc_disk_attr_group = { + .is_visible = mmc_disk_attrs_is_visible, + .attrs = mmc_disk_attrs, +}; + +static const struct attribute_group *mmc_disk_attr_groups[] = { + &mmc_disk_attr_group, + NULL, +}; + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -792,6 +831,26 @@ static int mmc_blk_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif +static int mmc_blk_alternative_gpt_sector(struct gendisk *disk, + sector_t *sector) +{ + struct mmc_blk_data *md; + int ret; + + md = mmc_blk_get(disk); + if (!md) + return -EINVAL; + + if (md->queue.card) + ret = mmc_card_alternative_gpt_sector(md->queue.card, sector); + else + ret = -ENODEV; + + mmc_blk_put(md); + + return ret; +} + static const struct block_device_operations mmc_bdops = { .open = mmc_blk_open, .release = mmc_blk_release, @@ -801,6 +860,7 @@ static const struct block_device_operations mmc_bdops = { #ifdef CONFIG_COMPAT .compat_ioctl = mmc_blk_compat_ioctl, #endif + .alternative_gpt_sector = mmc_blk_alternative_gpt_sector, }; static int mmc_blk_part_switch_pre(struct mmc_card *card, @@ -2289,7 +2349,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, sector_t size, bool default_ro, const char *subname, - int area_type) + int area_type, + unsigned int part_type) { struct mmc_blk_data *md; int devidx, ret; @@ -2336,6 +2397,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, kref_init(&md->kref); md->queue.blkdata = md; + md->part_type = part_type; md->disk->major = MMC_BLOCK_MAJOR; md->disk->minors = perdev_minors; @@ -2388,6 +2450,10 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), cap_str, md->read_only ? 
"(ro)" : ""); + /* used in ->open, must be set before add_disk: */ + if (area_type == MMC_BLK_DATA_AREA_MAIN) + dev_set_drvdata(&card->dev, md); + device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); return md; err_kfree: @@ -2417,7 +2483,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card) } return mmc_blk_alloc_req(card, &card->dev, size, false, NULL, - MMC_BLK_DATA_AREA_MAIN); + MMC_BLK_DATA_AREA_MAIN, 0); } static int mmc_blk_alloc_part(struct mmc_card *card, @@ -2431,10 +2497,9 @@ static int mmc_blk_alloc_part(struct mmc_card *card, struct mmc_blk_data *part_md; part_md = mmc_blk_alloc_req(card, disk_to_dev(md->disk), size, default_ro, - subname, area_type); + subname, area_type, part_type); if (IS_ERR(part_md)) return PTR_ERR(part_md); - part_md->part_type = part_type; list_add(&part_md->part, &md->part); return 0; @@ -2635,27 +2700,13 @@ static int mmc_blk_alloc_parts(struct mmc_card *card, struct mmc_blk_data *md) static void mmc_blk_remove_req(struct mmc_blk_data *md) { - struct mmc_card *card; - - if (md) { - /* - * Flush remaining requests and free queues. It - * is freeing the queue that stops new requests - * from being accepted. - */ - card = md->queue.card; - if (md->disk->flags & GENHD_FL_UP) { - device_remove_file(disk_to_dev(md->disk), &md->force_ro); - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) - device_remove_file(disk_to_dev(md->disk), - &md->power_ro_lock); - - del_gendisk(md->disk); - } - mmc_cleanup_queue(&md->queue); - mmc_blk_put(md); - } + /* + * Flush remaining requests and free queues. It is freeing the queue + * that stops new requests from being accepted. 
+ */ + del_gendisk(md->disk); + mmc_cleanup_queue(&md->queue); + mmc_blk_put(md); } static void mmc_blk_remove_parts(struct mmc_card *card, @@ -2679,51 +2730,6 @@ static void mmc_blk_remove_parts(struct mmc_card *card, } } -static int mmc_add_disk(struct mmc_blk_data *md) -{ - int ret; - struct mmc_card *card = md->queue.card; - - device_add_disk(md->parent, md->disk, NULL); - md->force_ro.show = force_ro_show; - md->force_ro.store = force_ro_store; - sysfs_attr_init(&md->force_ro.attr); - md->force_ro.attr.name = "force_ro"; - md->force_ro.attr.mode = S_IRUGO | S_IWUSR; - ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); - if (ret) - goto force_ro_fail; - - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) { - umode_t mode; - - if (card->ext_csd.boot_ro_lock & EXT_CSD_BOOT_WP_B_PWR_WP_DIS) - mode = S_IRUGO; - else - mode = S_IRUGO | S_IWUSR; - - md->power_ro_lock.show = power_ro_lock_show; - md->power_ro_lock.store = power_ro_lock_store; - sysfs_attr_init(&md->power_ro_lock.attr); - md->power_ro_lock.attr.mode = mode; - md->power_ro_lock.attr.name = - "ro_lock_until_next_power_on"; - ret = device_create_file(disk_to_dev(md->disk), - &md->power_ro_lock); - if (ret) - goto power_ro_lock_fail; - } - return ret; - -power_ro_lock_fail: - device_remove_file(disk_to_dev(md->disk), &md->force_ro); -force_ro_fail: - del_gendisk(md->disk); - - return ret; -} - #ifdef CONFIG_DEBUG_FS static int mmc_dbg_card_status_get(void *data, u64 *val) @@ -2889,7 +2895,7 @@ static void mmc_blk_remove_debugfs(struct mmc_card *card, static int mmc_blk_probe(struct mmc_card *card) { - struct mmc_blk_data *md, *part_md; + struct mmc_blk_data *md; int ret = 0; /* @@ -2917,18 +2923,6 @@ static int mmc_blk_probe(struct mmc_card *card) if (ret) goto out; - dev_set_drvdata(&card->dev, md); - - ret = mmc_add_disk(md); - if (ret) - goto out; - - list_for_each_entry(part_md, &md->part, part) { - ret = mmc_add_disk(part_md); - if (ret) - goto out; - } - 
/* Add two debugfs entries */ mmc_blk_add_debugfs(card, md); diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 95fedcf56e4a..605f5e8648c1 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2149,6 +2149,41 @@ int mmc_detect_card_removed(struct mmc_host *host) } EXPORT_SYMBOL(mmc_detect_card_removed); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *gpt_sector) +{ + unsigned int boot_sectors_num; + + if ((!(card->host->caps2 & MMC_CAP2_ALT_GPT_TEGRA))) + return -EOPNOTSUPP; + + /* filter out unrelated cards */ + if (card->ext_csd.rev < 3 || + !mmc_card_mmc(card) || + !mmc_card_is_blockaddr(card) || + mmc_card_is_removable(card->host)) + return -ENOENT; + + /* + * eMMC storage has two special boot partitions in addition to the + * main one. NVIDIA's bootloader linearizes eMMC boot0->boot1->main + * accesses, this means that the partition table addresses are shifted + * by the size of boot partitions. In accordance with the eMMC + * specification, the boot partition size is calculated as follows: + * + * boot partition size = 128K byte x BOOT_SIZE_MULT + * + * Calculate number of sectors occupied by the both boot partitions. + */ + boot_sectors_num = card->ext_csd.raw_boot_mult * SZ_128K / + SZ_512 * MMC_NUM_BOOT_PARTITION; + + /* Defined by NVIDIA and used by Android devices. 
*/ + *gpt_sector = card->ext_csd.sectors - boot_sectors_num - 1; + + return 0; +} +EXPORT_SYMBOL(mmc_card_alternative_gpt_sector); + void mmc_rescan(struct work_struct *work) { struct mmc_host *host = diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 0c4de2030b3f..7931a4f0137d 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -119,6 +119,8 @@ void mmc_release_host(struct mmc_host *host); void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx); void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *sector); + /** * mmc_claim_host - exclusively claim a host * @host: mmc host to claim diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 838726b68ff3..29e58ffae379 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -418,6 +418,8 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]; card->ext_csd.raw_hc_erase_grp_size = ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]; + card->ext_csd.raw_boot_mult = + ext_csd[EXT_CSD_BOOT_MULT]; if (card->ext_csd.rev >= 3) { u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT]; card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG]; diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 387ce9cdbd7c..a5001875876b 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -116,6 +116,8 @@ */ #define NVQUIRK_HAS_TMCLK BIT(10) +#define NVQUIRK_HAS_ANDROID_GPT_SECTOR BIT(11) + /* SDMMC CQE Base Address for Tegra Host Ver 4.1 and Higher */ #define SDHCI_TEGRA_CQE_BASE_ADDR 0xF000 @@ -1361,6 +1363,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1390,6 +1393,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra30 
= { .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_HAS_PADCALIB, }; @@ -1422,6 +1426,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, .dma_mask = DMA_BIT_MASK(32), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1438,6 +1443,7 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, .dma_mask = DMA_BIT_MASK(34), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1616,6 +1622,9 @@ static int sdhci_tegra_probe(struct platform_device *pdev) tegra_host->pad_control_available = false; tegra_host->soc_data = soc_data; + if (soc_data->nvquirks & NVQUIRK_HAS_ANDROID_GPT_SECTOR) + host->mmc->caps2 |= MMC_CAP2_ALT_GPT_TEGRA; + if (soc_data->nvquirks & NVQUIRK_NEEDS_PAD_CONTROL) { rc = tegra_sdhci_init_pinctrl_info(&pdev->dev, tegra_host); if (rc == 0) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c3f3d77f1aac..dc0450ca23a3 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -33,12 +33,12 @@ config NVME_HWMON in the system. 
config NVME_FABRICS + select NVME_CORE tristate config NVME_RDMA tristate "NVM Express over Fabrics RDMA host driver" depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -55,7 +55,6 @@ config NVME_FC tristate "NVM Express over Fabrics FC host driver" depends on BLOCK depends on HAS_DMA - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -72,7 +71,6 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK - select NVME_CORE select NVME_FABRICS select CRYPTO select CRYPTO_CRC32C diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cbc509784b2e..dfaacd472e5d 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -12,7 +12,6 @@ obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o -nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfd9dec0c1f6..8679a108f571 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -587,9 +587,6 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->ndev) - nvme_nvm_unregister(ns); - put_disk(ns->disk); nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); @@ -968,12 +965,11 @@ void nvme_cleanup_cmd(struct request *req) { if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; - struct page *page = req->special_vec.bv_page; - if (page == ctrl->discard_page) + if (req->special_vec.bv_page == ctrl->discard_page) clear_bit_unlock(0, &ctrl->discard_page_busy); else - kfree(page_address(page) + req->special_vec.bv_offset); + kfree(bvec_virt(&req->special_vec)); } } 
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); @@ -1029,7 +1025,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - cmd->common.command_id = req->tag; + nvme_req(req)->genctr++; + cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; } @@ -1822,7 +1819,7 @@ static void nvme_update_disk_info(struct gendisk *disk, static inline bool nvme_first_scan(struct gendisk *disk) { /* nvme_alloc_ns() scans the disk prior to adding it */ - return !(disk->flags & GENHD_FL_UP); + return !disk_live(disk); } static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) @@ -1890,7 +1887,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); - blk_queue_update_readahead(ns->head->disk->queue); + disk_update_readahead(ns->head->disk); blk_mq_unfreeze_queue(ns->head->disk->queue); } return 0; @@ -3218,9 +3215,6 @@ static const struct attribute_group nvme_ns_id_attr_group = { const struct attribute_group *nvme_ns_id_attr_groups[] = { &nvme_ns_id_attr_group, -#ifdef CONFIG_NVM - &nvme_nvm_attr_group, -#endif NULL, }; @@ -3729,9 +3723,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!ns) goto out_free_id; - ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) + disk = blk_mq_alloc_disk(ctrl->tagset, ns); + if (IS_ERR(disk)) goto out_free_ns; + disk->fops = &nvme_bdev_ops; + disk->private_data = ns; + + ns->disk = disk; + ns->queue = disk->queue; if (ctrl->opts && ctrl->opts->data_digest) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); @@ -3740,20 +3739,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); if (nvme_init_ns_head(ns, nsid, 
ids, id->nmic & NVME_NS_NMIC_SHARED)) - goto out_free_queue; + goto out_cleanup_disk; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_unlink_ns; - - disk->fops = &nvme_bdev_ops; - disk->private_data = ns; - disk->queue = ns->queue; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the @@ -3762,17 +3753,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - ns->disk = disk; if (nvme_update_ns_info(ns, id)) - goto out_put_disk; - - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk->disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; - } - } + goto out_unlink_ns; down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); @@ -3789,10 +3772,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, kfree(id); return; - out_put_disk: - /* prevent double queue cleanup */ - ns->disk->queue = NULL; - put_disk(ns->disk); + out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3800,8 +3780,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_free_queue: - blk_cleanup_queue(ns->queue); + out_cleanup_disk: + blk_cleanup_disk(disk); out_free_ns: kfree(ns); out_free_id: @@ -3826,14 +3806,12 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk->flags & GENHD_FL_UP) { - if (!nvme_ns_head_multipath(ns->head)) - nvme_cdev_del(&ns->cdev, &ns->cdev_device); - del_gendisk(ns->disk); - blk_cleanup_queue(ns->queue); - if (blk_get_integrity(ns->disk)) - 
blk_integrity_unregister(ns->disk); - } + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); + blk_cleanup_queue(ns->queue); + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); down_write(&ns->ctrl->namespaces_rwsem); list_del_init(&ns->list); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a5469fd9d4c3..668c6bb7a567 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } - nvmf_host_put(opts->host); opts->host = nvmf_host_add(p); kfree(p); if (!opts->host) { diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 305ddd415e45..22314962842d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -342,9 +342,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_IO64_CMD: return nvme_user_cmd64(ns->ctrl, ns, argp); default: - if (!ns->ndev) - return -ENOTTY; - return nvme_nvm_ioctl(ns, cmd, argp); + return -ENOTTY; } } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c deleted file mode 100644 index e9d9ad47f70f..000000000000 --- a/drivers/nvme/host/lightnvm.c +++ /dev/null @@ -1,1274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * nvme-lightnvm.c - LightNVM NVMe device - * - * Copyright (C) 2014-2015 IT University of Copenhagen - * Initial release: Matias Bjorling - */ - -#include "nvme.h" - -#include -#include -#include -#include -#include -#include - -enum nvme_nvm_admin_opcode { - nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_bb_tbl = 0xf2, - nvme_nvm_admin_set_bb_tbl = 0xf1, -}; - -enum nvme_nvm_log_page { - NVME_NVM_LOG_REPORT_CHUNK = 0xca, -}; - -struct nvme_nvm_ph_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - 
__le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_erase_blk { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_identity { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __u32 rsvd11[6]; -}; - -struct nvme_nvm_getbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __u32 rsvd4[4]; -}; - -struct nvme_nvm_setbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 nlb; - __u8 value; - __u8 rsvd3; - __u32 rsvd4[3]; -}; - -struct nvme_nvm_command { - union { - struct nvme_common_command common; - struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_erase_blk erase; - struct nvme_nvm_identity identity; - struct nvme_nvm_getbbtbl get_bb; - struct nvme_nvm_setbbtbl set_bb; - }; -}; - -struct nvme_nvm_id12_grp { - __u8 mtype; - __u8 fmtype; - __le16 res16; - __u8 num_ch; - __u8 num_lun; - __u8 num_pln; - __u8 rsvd1; - __le16 num_chk; - __le16 num_pg; - __le16 fpg_sz; - __le16 csecs; - __le16 sos; - __le16 rsvd2; - __le32 trdt; - __le32 trdm; - __le32 tprt; - __le32 tprm; - __le32 tbet; - __le32 tbem; - __le32 mpos; - __le32 mccap; - __le16 cpar; - __u8 reserved[906]; -} __packed; - -struct nvme_nvm_id12_addrf { - __u8 ch_offset; - __u8 ch_len; - __u8 lun_offset; - __u8 lun_len; - __u8 pln_offset; - __u8 pln_len; - __u8 blk_offset; - __u8 blk_len; - __u8 pg_offset; - __u8 pg_len; - __u8 sec_offset; - __u8 sec_len; - __u8 res[4]; -} __packed; - -struct nvme_nvm_id12 { - __u8 ver_id; - __u8 vmnt; - __u8 cgrps; - __u8 res; - __le32 cap; - __le32 dom; - struct nvme_nvm_id12_addrf ppaf; - __u8 resv[228]; - struct nvme_nvm_id12_grp grp; - __u8 resv2[2880]; -} __packed; - 
-struct nvme_nvm_bb_tbl { - __u8 tblid[4]; - __le16 verid; - __le16 revid; - __le32 rvsd1; - __le32 tblks; - __le32 tfact; - __le32 tgrown; - __le32 tdresv; - __le32 thresv; - __le32 rsvd2[8]; - __u8 blk[]; -}; - -struct nvme_nvm_id20_addrf { - __u8 grp_len; - __u8 pu_len; - __u8 chk_len; - __u8 lba_len; - __u8 resv[4]; -}; - -struct nvme_nvm_id20 { - __u8 mjr; - __u8 mnr; - __u8 resv[6]; - - struct nvme_nvm_id20_addrf lbaf; - - __le32 mccap; - __u8 resv2[12]; - - __u8 wit; - __u8 resv3[31]; - - /* Geometry */ - __le16 num_grp; - __le16 num_pu; - __le32 num_chk; - __le32 clba; - __u8 resv4[52]; - - /* Write data requirements */ - __le32 ws_min; - __le32 ws_opt; - __le32 mw_cunits; - __le32 maxoc; - __le32 maxocpu; - __u8 resv5[44]; - - /* Performance related metrics */ - __le32 trdt; - __le32 trdm; - __le32 twrt; - __le32 twrm; - __le32 tcrst; - __le32 tcrsm; - __u8 resv6[40]; - - /* Reserved area */ - __u8 resv7[2816]; - - /* Vendor specific */ - __u8 vs[1024]; -}; - -struct nvme_nvm_chk_meta { - __u8 state; - __u8 type; - __u8 wi; - __u8 rsvd[5]; - __le64 slba; - __le64 cnlb; - __le64 wp; -}; - -/* - * Check we didn't inadvertently grow the command struct - */ -static inline void _nvme_nvm_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != - 
sizeof(struct nvm_chk_meta)); -} - -static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst, - struct nvme_nvm_id12_addrf *src) -{ - dst->ch_len = src->ch_len; - dst->lun_len = src->lun_len; - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->ch_offset = src->ch_offset; - dst->lun_offset = src->lun_offset; - dst->blk_offset = src->blk_offset; - dst->pg_offset = src->pg_offset; - dst->pln_offset = src->pln_offset; - dst->sec_offset = src->sec_offset; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id, - struct nvm_geo *geo) -{ - struct nvme_nvm_id12_grp *src; - int sec_per_pg, sec_per_pl, pg_per_blk; - - if (id->cgrps != 1) - return -EINVAL; - - src = &id->grp; - - if (src->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* 1.2 spec. 
only reports a single version id - unfold */ - geo->major_ver_id = id->ver_id; - geo->minor_ver_id = 2; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_12; - - geo->num_ch = src->num_ch; - geo->num_lun = src->num_lun; - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le16_to_cpu(src->num_chk); - - geo->csecs = le16_to_cpu(src->csecs); - geo->sos = le16_to_cpu(src->sos); - - pg_per_blk = le16_to_cpu(src->num_pg); - sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs; - sec_per_pl = sec_per_pg * src->num_pln; - geo->clba = sec_per_pl * pg_per_blk; - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = sec_per_pg; - geo->ws_opt = sec_per_pg; - geo->mw_cunits = geo->ws_opt << 3; /* default to MLC safe values */ - - /* Do not impose values for maximum number of open blocks as it is - * unspecified in 1.2. Users of 1.2 must be aware of this and eventually - * specify these values through a quirk if restrictions apply. 
- */ - geo->maxoc = geo->all_luns * geo->num_chk; - geo->maxocpu = geo->num_chk; - - geo->mccap = le32_to_cpu(src->mccap); - - geo->trdt = le32_to_cpu(src->trdt); - geo->trdm = le32_to_cpu(src->trdm); - geo->tprt = le32_to_cpu(src->tprt); - geo->tprm = le32_to_cpu(src->tprm); - geo->tbet = le32_to_cpu(src->tbet); - geo->tbem = le32_to_cpu(src->tbem); - - /* 1.2 compatibility */ - geo->vmnt = id->vmnt; - geo->cap = le32_to_cpu(id->cap); - geo->dom = le32_to_cpu(id->dom); - - geo->mtype = src->mtype; - geo->fmtype = src->fmtype; - - geo->cpar = le16_to_cpu(src->cpar); - geo->mpos = le32_to_cpu(src->mpos); - - geo->pln_mode = NVM_PLANE_SINGLE; - - if (geo->mpos & 0x020202) { - geo->pln_mode = NVM_PLANE_DOUBLE; - geo->ws_opt <<= 1; - } else if (geo->mpos & 0x040404) { - geo->pln_mode = NVM_PLANE_QUAD; - geo->ws_opt <<= 2; - } - - geo->num_pln = src->num_pln; - geo->num_pg = le16_to_cpu(src->num_pg); - geo->fpg_sz = le16_to_cpu(src->fpg_sz); - - nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf); - - return 0; -} - -static void nvme_nvm_set_addr_20(struct nvm_addrf *dst, - struct nvme_nvm_id20_addrf *src) -{ - dst->ch_len = src->grp_len; - dst->lun_len = src->pu_len; - dst->chk_len = src->chk_len; - dst->sec_len = src->lba_len; - - dst->sec_offset = 0; - dst->chk_offset = dst->sec_len; - dst->lun_offset = dst->chk_offset + dst->chk_len; - dst->ch_offset = dst->lun_offset + dst->lun_len; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, - struct nvm_geo *geo) -{ - geo->major_ver_id = id->mjr; - geo->minor_ver_id = id->mnr; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_20; - - geo->num_ch = le16_to_cpu(id->num_grp); - geo->num_lun = 
le16_to_cpu(id->num_pu); - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le32_to_cpu(id->num_chk); - geo->clba = le32_to_cpu(id->clba); - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = le32_to_cpu(id->ws_min); - geo->ws_opt = le32_to_cpu(id->ws_opt); - geo->mw_cunits = le32_to_cpu(id->mw_cunits); - geo->maxoc = le32_to_cpu(id->maxoc); - geo->maxocpu = le32_to_cpu(id->maxocpu); - - geo->trdt = le32_to_cpu(id->trdt); - geo->trdm = le32_to_cpu(id->trdm); - geo->tprt = le32_to_cpu(id->twrt); - geo->tprm = le32_to_cpu(id->twrm); - geo->tbet = le32_to_cpu(id->tcrst); - geo->tbem = le32_to_cpu(id->tcrsm); - - nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf); - - return 0; -} - -static int nvme_nvm_identity(struct nvm_dev *nvmdev) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_id12 *id; - struct nvme_nvm_command c = {}; - int ret; - - c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->head->ns_id); - - id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL); - if (!id) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - id, sizeof(struct nvme_nvm_id12)); - if (ret) { - ret = -EIO; - goto out; - } - - /* - * The 1.2 and 2.0 specifications share the first byte in their geometry - * command to make it possible to know what version a device implements. 
- */ - switch (id->ver_id) { - case 1: - ret = nvme_nvm_setup_12(id, &nvmdev->geo); - break; - case 2: - ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id, - &nvmdev->geo); - break; - default: - dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n", - id->ver_id); - ret = -EINVAL; - } - -out: - kfree(id); - return ret; -} - -static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - u8 *blks) -{ - struct request_queue *q = nvmdev->q; - struct nvm_geo *geo = &nvmdev->geo; - struct nvme_ns *ns = q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_command c = {}; - struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->num_chk * geo->num_pln; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; - int ret = 0; - - c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.get_bb.spba = cpu_to_le64(ppa.ppa); - - bb_tbl = kzalloc(tblsz, GFP_KERNEL); - if (!bb_tbl) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, - bb_tbl, tblsz); - if (ret) { - dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || - bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->device, "bbt format mismatch\n"); - ret = -EINVAL; - goto out; - } - - if (le16_to_cpu(bb_tbl->verid) != 1) { - ret = -EINVAL; - dev_err(ctrl->device, "bbt version not supported\n"); - goto out; - } - - if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { - ret = -EINVAL; - dev_err(ctrl->device, - "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blks); - goto out; - } - - memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln); -out: - kfree(bb_tbl); - return ret; -} - -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - int ret 
= 0; - - c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.set_bb.spba = cpu_to_le64(ppas->ppa); - c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); - c.set_bb.value = type; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - NULL, 0); - if (ret) - dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", - ret); - return ret; -} - -/* - * Expect the lba in device format - */ -static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, - sector_t slba, int nchks, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &ndev->geo; - struct nvme_ns *ns = ndev->q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off; - struct ppa_addr ppa; - size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); - size_t log_pos, offset, len; - int i, max_len; - int ret = 0; - - /* - * limit requests to maximum 256K to avoid issuing arbitrary large - * requests when the device does not specific a maximum transfer size. 
- */ - max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); - - dev_meta = kmalloc(max_len, GFP_KERNEL); - if (!dev_meta) - return -ENOMEM; - - /* Normalize lba address space to obtain log offset */ - ppa.ppa = slba; - ppa = dev_to_generic_addr(ndev, ppa); - - log_pos = ppa.m.chk; - log_pos += ppa.m.pu * geo->num_chk; - log_pos += ppa.m.grp * geo->num_lun * geo->num_chk; - - offset = log_pos * sizeof(struct nvme_nvm_chk_meta); - - while (left) { - len = min_t(unsigned int, left, max_len); - - memset(dev_meta, 0, max_len); - dev_meta_off = dev_meta; - - ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, - dev_meta, len, offset); - if (ret) { - dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); - break; - } - - for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) { - meta->state = dev_meta_off->state; - meta->type = dev_meta_off->type; - meta->wi = dev_meta_off->wi; - meta->slba = le64_to_cpu(dev_meta_off->slba); - meta->cnlb = le64_to_cpu(dev_meta_off->cnlb); - meta->wp = le64_to_cpu(dev_meta_off->wp); - - meta++; - dev_meta_off++; - } - - offset += len; - left -= len; - } - - kfree(dev_meta); - - return ret; -} - -static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, - struct nvme_nvm_command *c) -{ - c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); - c->ph_rw.control = cpu_to_le16(rqd->flags); - c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); -} - -static void nvme_nvm_end_io(struct request *rq, blk_status_t status) -{ - struct nvm_rq *rqd = rq->end_io_data; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - nvm_end_io(rqd); - - kfree(nvme_req(rq)->cmd); - blk_mq_free_request(rq); -} - -static struct request *nvme_nvm_alloc_request(struct request_queue *q, - struct nvm_rq *rqd, - struct 
nvme_nvm_command *cmd) -{ - struct nvme_ns *ns = q->queuedata; - struct request *rq; - - nvme_nvm_rqtocmd(rqd, ns, cmd); - - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); - if (IS_ERR(rq)) - return rq; - - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - - if (rqd->bio) - blk_rq_append_bio(rq, rqd->bio); - else - rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - - return rq; -} - -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; - struct nvme_nvm_command *cmd; - struct request *rq; - int ret; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - rq = nvme_nvm_alloc_request(q, rqd, cmd); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - goto err_free_cmd; - } - - if (buf) { - ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, - GFP_KERNEL); - if (ret) - goto err_free_cmd; - } - - rq->end_io_data = rqd; - - blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); - - return 0; - -err_free_cmd: - kfree(cmd); - return ret; -} - -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, - int size) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - - return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); -} - -static void nvme_nvm_destroy_dma_pool(void *pool) -{ - struct dma_pool *dma_pool = pool; - - dma_pool_destroy(dma_pool); -} - -static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, - gfp_t mem_flags, dma_addr_t *dma_handler) -{ - return dma_pool_alloc(pool, mem_flags, dma_handler); -} - -static void nvme_nvm_dev_dma_free(void *pool, void *addr, - dma_addr_t dma_handler) -{ - dma_pool_free(pool, addr, dma_handler); -} - -static struct nvm_dev_ops nvme_nvm_dev_ops = { - .identity = nvme_nvm_identity, - - .get_bb_tbl = nvme_nvm_get_bb_tbl, - .set_bb_tbl = nvme_nvm_set_bb_tbl, - - .get_chk_meta = nvme_nvm_get_chk_meta, - - .submit_io = nvme_nvm_submit_io, - - 
.create_dma_pool = nvme_nvm_create_dma_pool, - .destroy_dma_pool = nvme_nvm_destroy_dma_pool, - .dev_dma_alloc = nvme_nvm_dev_dma_alloc, - .dev_dma_free = nvme_nvm_dev_dma_free, -}; - -static int nvme_nvm_submit_user_cmd(struct request_queue *q, - struct nvme_ns *ns, - struct nvme_nvm_command *vcmd, - void __user *ubuf, unsigned int bufflen, - void __user *meta_buf, unsigned int meta_len, - void __user *ppa_buf, unsigned int ppa_len, - u32 *result, u64 *status, unsigned int timeout) -{ - bool write = nvme_is_write((struct nvme_command *)vcmd); - struct nvm_dev *dev = ns->ndev; - struct request *rq; - struct bio *bio = NULL; - __le64 *ppa_list = NULL; - dma_addr_t ppa_dma; - __le64 *metadata = NULL; - dma_addr_t metadata_dma; - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); - if (IS_ERR(rq)) { - ret = -ENOMEM; - goto err_cmd; - } - - if (timeout) - rq->timeout = timeout; - - if (ppa_buf && ppa_len) { - ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); - if (!ppa_list) { - ret = -ENOMEM; - goto err_rq; - } - if (copy_from_user(ppa_list, (void __user *)ppa_buf, - sizeof(u64) * (ppa_len + 1))) { - ret = -EFAULT; - goto err_ppa; - } - vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); - } else { - vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); - } - - if (ubuf && bufflen) { - ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); - if (ret) - goto err_ppa; - bio = rq->bio; - - if (meta_buf && meta_len) { - metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, - &metadata_dma); - if (!metadata) { - ret = -ENOMEM; - goto err_map; - } - - if (write) { - if (copy_from_user(metadata, - (void __user *)meta_buf, - meta_len)) { - ret = -EFAULT; - goto err_meta; - } - } - vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); - } - - bio_set_dev(bio, ns->disk->part0); - } - - blk_execute_rq(NULL, rq, 0); - - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else if (nvme_req(rq)->status & 
0x7ff) - ret = -EIO; - if (result) - *result = nvme_req(rq)->status & 0x7ff; - if (status) - *status = le64_to_cpu(nvme_req(rq)->result.u64); - - if (metadata && !ret && !write) { - if (copy_to_user(meta_buf, (void *)metadata, meta_len)) - ret = -EFAULT; - } -err_meta: - if (meta_buf && meta_len) - dma_pool_free(dev->dma_pool, metadata, metadata_dma); -err_map: - if (bio) - blk_rq_unmap_user(bio); -err_ppa: - if (ppa_buf && ppa_len) - dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); -err_rq: - blk_mq_free_request(rq); -err_cmd: - return ret; -} - -static int nvme_nvm_submit_vio(struct nvme_ns *ns, - struct nvm_user_vio __user *uvio) -{ - struct nvm_user_vio vio; - struct nvme_nvm_command c; - unsigned int length; - int ret; - - if (copy_from_user(&vio, uvio, sizeof(vio))) - return -EFAULT; - if (vio.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c.ph_rw.control = cpu_to_le16(vio.control); - c.ph_rw.length = cpu_to_le16(vio.nppas); - - length = (vio.nppas + 1) << ns->lba_shift; - - ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, - (void __user *)(uintptr_t)vio.addr, length, - (void __user *)(uintptr_t)vio.metadata, - vio.metadata_len, - (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, - &vio.result, &vio.status, 0); - - if (ret && copy_to_user(uvio, &vio, sizeof(vio))) - return -EFAULT; - - return ret; -} - -static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, - struct nvm_passthru_vio __user *uvcmd) -{ - struct nvm_passthru_vio vcmd; - struct nvme_nvm_command c; - struct request_queue *q; - unsigned int timeout = 0; - int ret; - - if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) - return -EFAULT; - if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) - return -EACCES; - if (vcmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); - 
c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); - /* cdw11-12 */ - c.ph_rw.length = cpu_to_le16(vcmd.nppas); - c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw13 = cpu_to_le32(vcmd.cdw13); - c.common.cdw14 = cpu_to_le32(vcmd.cdw14); - c.common.cdw15 = cpu_to_le32(vcmd.cdw15); - - if (vcmd.timeout_ms) - timeout = msecs_to_jiffies(vcmd.timeout_ms); - - q = admin ? ns->ctrl->admin_q : ns->queue; - - ret = nvme_nvm_submit_user_cmd(q, ns, - (struct nvme_nvm_command *)&c, - (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, - (void __user *)(uintptr_t)vcmd.metadata, - vcmd.metadata_len, - (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, - &vcmd.result, &vcmd.status, timeout); - - if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) - return -EFAULT; - - return ret; -} - -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) -{ - switch (cmd) { - case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, argp); - case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, argp); - case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, argp); - default: - return -ENOTTY; - } -} - -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) -{ - struct request_queue *q = ns->queue; - struct nvm_dev *dev; - struct nvm_geo *geo; - - _nvme_nvm_check_size(); - - dev = nvm_alloc_dev(node); - if (!dev) - return -ENOMEM; - - /* Note that csecs and sos will be overridden if it is a 1.2 drive. 
*/ - geo = &dev->geo; - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; - if (ns->features & NVME_NS_EXT_LBAS) - geo->ext = true; - else - geo->ext = false; - geo->mdts = ns->ctrl->max_hw_sectors; - - dev->q = q; - memcpy(dev->name, disk_name, DISK_NAME_LEN); - dev->ops = &nvme_nvm_dev_ops; - dev->private_data = ns; - ns->ndev = dev; - - return nvm_register(dev); -} - -void nvme_nvm_unregister(struct nvme_ns *ns) -{ - nvm_unregister(ns->ndev); -} - -static ssize_t nvm_dev_attr_show(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "version") == 0) { - if (geo->major_ver_id == 1) - return scnprintf(page, PAGE_SIZE, "%u\n", - geo->major_ver_id); - else - return scnprintf(page, PAGE_SIZE, "%u.%u\n", - geo->major_ver_id, - geo->minor_ver_id); - } else if (strcmp(attr->name, "capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap); - } else if (strcmp(attr->name, "read_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt); - } else if (strcmp(attr->name, "read_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm); - } else { - return scnprintf(page, - PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page) -{ - return scnprintf(page, PAGE_SIZE, - "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->sec_offset, ppaf->sec_len); -} - -static ssize_t nvm_dev_attr_show_12(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct 
nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "vendor_opcode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt); - } else if (strcmp(attr->name, "device_mode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom); - /* kept for compatibility */ - } else if (strcmp(attr->name, "media_manager") == 0) { - return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); - } else if (strcmp(attr->name, "ppa_format") == 0) { - return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page); - } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype); - } else if (strcmp(attr->name, "flash_media_type") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype); - } else if (strcmp(attr->name, "num_channels") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "num_luns") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "num_planes") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln); - } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "num_pages") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg); - } else if (strcmp(attr->name, "page_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz); - } else if (strcmp(attr->name, "hw_sector_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs); - } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos); - } else if (strcmp(attr->name, "prog_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "prog_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else 
if (strcmp(attr->name, "erase_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "erase_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else if (strcmp(attr->name, "multiplane_modes") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos); - } else if (strcmp(attr->name, "media_capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap); - } else if (strcmp(attr->name, "max_phys_secs") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_20(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "groups") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "punits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "chunks") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "clba") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba); - } else if (strcmp(attr->name, "ws_min") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min); - } else if (strcmp(attr->name, "ws_opt") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt); - } else if (strcmp(attr->name, "maxoc") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc); - } else if (strcmp(attr->name, "maxocpu") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu); - } else if (strcmp(attr->name, "mw_cunits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits); - } else if (strcmp(attr->name, 
"write_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "write_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "reset_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "reset_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -#define NVM_DEV_ATTR_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) -#define NVM_DEV_ATTR_12_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL) -#define NVM_DEV_ATTR_20_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL) - -/* general attributes */ -static NVM_DEV_ATTR_RO(version); -static NVM_DEV_ATTR_RO(capabilities); - -static NVM_DEV_ATTR_RO(read_typ); -static NVM_DEV_ATTR_RO(read_max); - -/* 1.2 values */ -static NVM_DEV_ATTR_12_RO(vendor_opcode); -static NVM_DEV_ATTR_12_RO(device_mode); -static NVM_DEV_ATTR_12_RO(ppa_format); -static NVM_DEV_ATTR_12_RO(media_manager); -static NVM_DEV_ATTR_12_RO(media_type); -static NVM_DEV_ATTR_12_RO(flash_media_type); -static NVM_DEV_ATTR_12_RO(num_channels); -static NVM_DEV_ATTR_12_RO(num_luns); -static NVM_DEV_ATTR_12_RO(num_planes); -static NVM_DEV_ATTR_12_RO(num_blocks); -static NVM_DEV_ATTR_12_RO(num_pages); -static NVM_DEV_ATTR_12_RO(page_size); -static NVM_DEV_ATTR_12_RO(hw_sector_size); -static NVM_DEV_ATTR_12_RO(oob_sector_size); -static NVM_DEV_ATTR_12_RO(prog_typ); -static NVM_DEV_ATTR_12_RO(prog_max); -static NVM_DEV_ATTR_12_RO(erase_typ); -static NVM_DEV_ATTR_12_RO(erase_max); -static NVM_DEV_ATTR_12_RO(multiplane_modes); -static NVM_DEV_ATTR_12_RO(media_capabilities); -static NVM_DEV_ATTR_12_RO(max_phys_secs); - -/* 2.0 values */ -static NVM_DEV_ATTR_20_RO(groups); -static NVM_DEV_ATTR_20_RO(punits); -static NVM_DEV_ATTR_20_RO(chunks); -static 
NVM_DEV_ATTR_20_RO(clba); -static NVM_DEV_ATTR_20_RO(ws_min); -static NVM_DEV_ATTR_20_RO(ws_opt); -static NVM_DEV_ATTR_20_RO(maxoc); -static NVM_DEV_ATTR_20_RO(maxocpu); -static NVM_DEV_ATTR_20_RO(mw_cunits); -static NVM_DEV_ATTR_20_RO(write_typ); -static NVM_DEV_ATTR_20_RO(write_max); -static NVM_DEV_ATTR_20_RO(reset_typ); -static NVM_DEV_ATTR_20_RO(reset_max); - -static struct attribute *nvm_dev_attrs[] = { - /* version agnostic attrs */ - &dev_attr_version.attr, - &dev_attr_capabilities.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, - - /* 1.2 attrs */ - &dev_attr_vendor_opcode.attr, - &dev_attr_device_mode.attr, - &dev_attr_media_manager.attr, - &dev_attr_ppa_format.attr, - &dev_attr_media_type.attr, - &dev_attr_flash_media_type.attr, - &dev_attr_num_channels.attr, - &dev_attr_num_luns.attr, - &dev_attr_num_planes.attr, - &dev_attr_num_blocks.attr, - &dev_attr_num_pages.attr, - &dev_attr_page_size.attr, - &dev_attr_hw_sector_size.attr, - &dev_attr_oob_sector_size.attr, - &dev_attr_prog_typ.attr, - &dev_attr_prog_max.attr, - &dev_attr_erase_typ.attr, - &dev_attr_erase_max.attr, - &dev_attr_multiplane_modes.attr, - &dev_attr_media_capabilities.attr, - &dev_attr_max_phys_secs.attr, - - /* 2.0 attrs */ - &dev_attr_groups.attr, - &dev_attr_punits.attr, - &dev_attr_chunks.attr, - &dev_attr_clba.attr, - &dev_attr_ws_min.attr, - &dev_attr_ws_opt.attr, - &dev_attr_maxoc.attr, - &dev_attr_maxocpu.attr, - &dev_attr_mw_cunits.attr, - - &dev_attr_write_typ.attr, - &dev_attr_write_max.attr, - &dev_attr_reset_typ.attr, - &dev_attr_reset_max.attr, - - NULL, -}; - -static umode_t nvm_dev_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = kobj_to_dev(kobj); - struct gendisk *disk = dev_to_disk(dev); - struct nvme_ns *ns = disk->private_data; - struct nvm_dev *ndev = ns->ndev; - struct device_attribute *dev_attr = - container_of(attr, typeof(*dev_attr), attr); - - if (!ndev) - return 0; - - if (dev_attr->show == 
nvm_dev_attr_show) - return attr->mode; - - switch (ndev->geo.major_ver_id) { - case 1: - if (dev_attr->show == nvm_dev_attr_show_12) - return attr->mode; - break; - case 2: - if (dev_attr->show == nvm_dev_attr_show_20) - return attr->mode; - break; - } - - return 0; -} - -const struct attribute_group nvme_nvm_attr_group = { - .name = "lightnvm", - .attrs = nvm_dev_attrs, - .is_visible = nvm_dev_attrs_visible, -}; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3f32c5e86bfc..37ce3e8b1db2 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -765,7 +765,7 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) if (!head->disk) return; kblockd_schedule_work(&head->requeue_work); - if (head->disk->flags & GENHD_FL_UP) { + if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5cd1fa3b8464..a2e1f298b217 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -48,11 +47,6 @@ extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; -enum { - NVME_NS_LBA = 0, - NVME_NS_LIGHTNVM = 1, -}; - /* * List of workarounds for devices that required behavior not specified in * the standard. @@ -92,11 +86,6 @@ enum nvme_quirks { */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), - /* - * Supports the LighNVM command set if indicated in vs[1]. 
- */ - NVME_QUIRK_LIGHTNVM = (1 << 6), - /* * Set MEDIUM priority on SQ creation */ @@ -158,6 +147,7 @@ enum nvme_quirks { struct nvme_request { struct nvme_command *cmd; union nvme_result result; + u8 genctr; u8 retries; u8 flags; u16 status; @@ -449,7 +439,6 @@ struct nvme_ns { u32 ana_grpid; #endif struct list_head siblings; - struct nvm_dev *ndev; struct kref kref; struct nvme_ns_head *head; @@ -497,6 +486,49 @@ struct nvme_ctrl_ops { int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; +/* + * nvme command_id is constructed as such: + * | xxxx | xxxxxxxxxxxx | + * gen request tag + */ +#define nvme_genctr_mask(gen) (gen & 0xf) +#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) +#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) +#define nvme_tag_from_cid(cid) (cid & 0xfff) + +static inline u16 nvme_cid(struct request *rq) +{ + return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; +} + +static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + u8 genctr = nvme_genctr_from_cid(command_id); + u16 tag = nvme_tag_from_cid(command_id); + struct request *rq; + + rq = blk_mq_tag_to_rq(tags, tag); + if (unlikely(!rq)) { + pr_err("could not locate request for tag %#x\n", + tag); + return NULL; + } + if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { + dev_err(nvme_req(rq)->ctrl->device, + "request %#x genctr mismatch (got %#x expected %#x)\n", + tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + return NULL; + } + return rq; +} + +static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); +} + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, const char *dev_name); @@ -594,7 +626,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) { - return !qid && 
command_id >= NVME_AQ_BLK_MQ_DEPTH; + return !qid && + nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; } void nvme_complete_rq(struct request *req); @@ -823,26 +856,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif -#ifdef CONFIG_NVM -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); -void nvme_nvm_unregister(struct nvme_ns *ns); -extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); -#else -static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) -{ - return 0; -} - -static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) -{ - return -ENOTTY; -} -#endif /* CONFIG_NVM */ - static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 51852085239e..b82492cd7503 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. 
Use 0 to disable SGLs."); +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = { static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); static int io_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -135,6 +137,7 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; u32 last_ps; + bool hmb; mempool_t *iod_mempool; @@ -153,18 +156,14 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + + bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -1014,7 +1013,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", @@ -1808,17 +1807,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev) return ret >= 0 ? 
0 : ret; } -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; @@ -1887,20 +1875,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -1923,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) dev_warn(dev->ctrl.device, "failed to set host mem (err %d, flags %#x).\n", ret, bits); - } + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + return ret; } @@ -2080,6 +2056,102 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute 
*attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. 
@@ -2751,6 +2823,10 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + nvme_start_ctrl(&dev->ctrl); return; @@ -2999,6 +3075,13 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3020,7 +3103,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_release_cmb(dev); + nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); @@ -3047,8 +3130,13 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + return 0; +reset: + return nvme_try_sched_reset(ctrl); } static int nvme_suspend(struct device *dev) @@ -3072,15 +3160,9 @@ static int nvme_suspend(struct device *dev) * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. 
*/ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true); @@ -3091,6 +3173,17 @@ static int nvme_suspend(struct device *dev) if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze; @@ -3243,12 +3336,6 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7f6b3a991501..a68704e39084 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -735,13 +735,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->ctrl.queue_count = nr_io_queues + 1; - if (ctrl->ctrl.queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); @@ -1730,10 +1730,10 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, 
struct request *rq; struct nvme_rdma_request *req; - rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on QP %#x not found\n", + "got bad command_id %#x on QP %#x\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); return; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cb15ee5b249..645025620154 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -487,11 +487,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag 0x%x not found\n", - nvme_tcp_queue_id(queue), cqe->command_id); + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); nvme_tcp_error_recovery(&queue->ctrl->ctrl); return -EINVAL; } @@ -508,11 +508,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } @@ -606,7 +606,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); data->ttag = pdu->ttag; - data->command_id = rq->tag; + data->command_id = nvme_cid(rq); data->data_offset = cpu_to_le32(req->data_sent); data->data_length = cpu_to_le32(req->pdu_len); return 0; @@ -619,11 +619,11 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct 
request *rq; int ret; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } req = blk_mq_rq_to_pdu(rq); @@ -702,17 +702,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; @@ -804,8 +796,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), - pdu->command_id); + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); queue->nr_cqe++; @@ -1228,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); } @@ -1533,6 +1526,7 @@ err_sock: sock_release(queue->sock); queue->sock = NULL; err_destroy_mutex: + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); return ret; } @@ -1769,13 +1763,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 
2) { + if (nr_io_queues == 0) { dev_err(ctrl->device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 6543015b6121..2a89c5aa0790 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -72,6 +72,20 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 *cdw10) { @@ -80,7 +94,7 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -201,6 +215,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvme_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4be2ececbc45..973561c93888 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -31,7 +31,6 @@ config NVME_TARGET_PASSTHRU config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select 
SG_POOL help @@ -65,7 +64,6 @@ config NVME_TARGET_FC config NVME_TARGET_FCLOOP tristate "NVMe over Fabrics FC Transport Loopback Test driver" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL depends on NVME_FC diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ac7210a3ea1c..66d05eecc2a9 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -802,6 +802,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) * controller teardown as a result of a keep-alive expiration. */ ctrl->reset_tbkas = true; + sq->ctrl->sqs[sq->qid] = NULL; nvmet_ctrl_put(ctrl); sq->ctrl = NULL; /* allows reusing the queue later */ } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 7d0f3523fdab..7d0454cee920 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,20 +109,37 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) u16 qid = le16_to_cpu(c->qid); u16 sqsize = le16_to_cpu(c->sqsize); struct nvmet_ctrl *old; + u16 mqes = NVME_CAP_MQES(ctrl->cap); u16 ret; + if (!sqsize) { + pr_warn("queue size zero!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); + ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + goto err; + } + + if (ctrl->sqs[qid] != NULL) { + pr_warn("qid %u has already been created\n", qid); + req->error_loc = offsetof(struct nvmf_connect_command, qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (sqsize > mqes) { + pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", + sqsize, mqes, ctrl->cntlid); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); req->error_loc = offsetof(struct 
nvmf_connect_command, opcode); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } - if (!sqsize) { - pr_warn("queue size zero!\n"); - req->error_loc = offsetof(struct nvmf_connect_command, sqsize); - ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; - goto err; - } /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); @@ -138,6 +155,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (ret) { pr_err("failed to install queue %d cntlid %d ret %x\n", qid, ctrl->cntlid, ret); + ctrl->sqs[qid] = NULL; goto err; } } @@ -260,11 +278,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) } status = nvmet_install_queue(ctrl, req); - if (status) { - /* pass back cntlid that had the issue of installing queue */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (status) goto out_ctrl_put; - } + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3a17a7e26bbf..0285ccc7541f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -107,10 +107,10 @@ static void nvme_loop_queue_response(struct nvmet_req *req) } else { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on queue %d not found\n", + "got bad command_id %#x on queue %d\n", cqe->command_id, nvme_loop_queue_idx(queue)); return; } diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 1373a3c67962..bff454d46255 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -27,7 +27,7 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = 
get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -49,6 +49,20 @@ static const char *nvmet_trace_get_lba_status(struct trace_seq *p, return ret; } +static const char *nvmet_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -94,6 +108,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, switch (opcode) { case nvme_admin_identify: return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvmet_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 17f8b7a45f21..46bc30fe85d2 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -115,14 +115,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) } status = nvmet_req_find_ns(req); - if (status) { - status = NVME_SC_INTERNAL; + if (status) goto done; - } if (!bdev_is_zoned(req->ns->bdev)) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto done; } diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 6bb775236c16..db5987281010 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -552,7 +552,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, dbio = dreq->bio; recid = first_rec; rq_for_each_segment(bv, 
req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { memset(dbio, 0, sizeof (struct dasd_diag_bio)); dbio->type = rw_cmd; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index fb5d8152652d..460e0f1cca53 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3276,7 +3276,7 @@ static int dasd_eckd_ese_read(struct dasd_ccw_req *cqr, struct irb *irb) end_blk = (curr_trk + 1) * recs_per_trk; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { if (first_blk + blk_count >= end_blk) { cqr->proc_bytes = blk_count * blksize; @@ -4008,7 +4008,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( last_rec - recid + 1, cmd, basedev, blksize); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -4175,7 +4175,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( idaw_dst = NULL; idaw_len = 0; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4518,7 +4518,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( new_track = 1; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4551,7 +4551,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( } } else { rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); last_tidaw = itcw_add_tidaw(itcw, 0x00, dst, bv.bv_len); if (IS_ERR(last_tidaw)) { @@ -4787,7 +4787,7 @@ static struct dasd_ccw_req 
*dasd_eckd_build_cp_raw(struct dasd_device *startdev, idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; if (cmd == DASD_ECKD_CCW_READ_TRACK) memset(dst, 0, seg_len); @@ -4848,7 +4848,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->uses_cdl == 0 || recid > 2*blk_per_trk) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->uses_cdl && recid <= 2*blk_per_trk) diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 3ad319aee51e..e084f4dedddd 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -501,7 +501,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp_regular( } recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -583,7 +583,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->rdc_data.mode.bits.data_chain != 0) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->rdc_data.mode.bits.data_chain == 0) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 493e8469893c..fa966e0db6ca 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -24,6 +24,8 @@ #include "dasd_int.h" +static struct lock_class_key dasd_bio_compl_lkclass; + /* * Allocate and register gendisk structure for device. 
*/ @@ -38,13 +40,15 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = alloc_disk(1 << DASD_PARTN_BITS); + gdp = __alloc_disk_node(block->request_queue, NUMA_NO_NODE, + &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; gdp->first_minor = base->devindex << DASD_PARTN_BITS; + gdp->minors = 1 << DASD_PARTN_BITS; gdp->fops = &dasd_device_operations; /* @@ -73,7 +77,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) set_disk_ro(gdp, 1); dasd_add_link_to_gendisk(gdp, base); - gdp->queue = block->request_queue; block->gdp = gdp; set_capacity(block->gdp, 0); device_add_disk(&base->cdev->dev, block->gdp, NULL); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 29180bdf0977..5be3d1c39a78 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -892,8 +892,7 @@ dcssblk_submit_bio(struct bio *bio) index = (bio->bi_iter.bi_sector >> 3); bio_for_each_segment(bvec, bio, iter) { - page_addr = (unsigned long) - page_address(bvec.bv_page) + bvec.bv_offset; + page_addr = (unsigned long)bvec_virt(&bvec); source_addr = dev_info->start + (index<<12) + bytes_done; if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0) // More paranoia. 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b8d55af763f9..610ebba0d66e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -129,6 +129,7 @@ static DEFINE_MUTEX(sd_ref_mutex); static struct kmem_cache *sd_cdb_cache; static mempool_t *sd_cdb_pool; static mempool_t *sd_page_pool; +static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", @@ -886,7 +887,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; - buf = page_address(rq->special_vec.bv_page); + buf = bvec_virt(&rq->special_vec); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); @@ -3408,7 +3409,8 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = alloc_disk(SD_MINORS); + gd = __alloc_disk_node(sdp->request_queue, NUMA_NO_NODE, + &sd_bio_compl_lkclass); if (!gd) goto out_free; @@ -3454,10 +3456,10 @@ static int sd_probe(struct device *dev) gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); + gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = &sdkp->driver; - gd->queue = sdkp->device->request_queue; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 91e2221bbb0d..d5889b4f0fd4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -166,7 +166,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ bool exclude; /* 1->open(O_EXCL) succeeded and is active */ int open_cnt; /* count of opens (perhaps < num(sfds) ) */ char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */ - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct cdev * cdev; /* char_dev [sysfs: /sys/cdev/major/sg] */ struct kref d_ref; } Sg_device; @@ -202,8 +202,7 @@ static void sg_device_destroy(struct kref *kref); #define SZ_SG_REQ_INFO 
sizeof(sg_req_info_t) #define sg_printk(prefix, sdp, fmt, a...) \ - sdev_prefix_printk(prefix, (sdp)->device, \ - (sdp)->disk->disk_name, fmt, ##a) + sdev_prefix_printk(prefix, (sdp)->device, (sdp)->name, fmt, ##a) /* * The SCSI interfaces that use read() and write() as an asynchronous variant of @@ -832,7 +831,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(NULL, srp->rq, at_head, sg_rq_end_io); return 0; } @@ -1119,8 +1118,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: - return blk_trace_setup(sdp->device->request_queue, - sdp->disk->disk_name, + return blk_trace_setup(sdp->device->request_queue, sdp->name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: @@ -1456,7 +1454,7 @@ static struct class *sg_sysfs_class; static int sg_sysfs_valid = 0; static Sg_device * -sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) +sg_alloc(struct scsi_device *scsidp) { struct request_queue *q = scsidp->request_queue; Sg_device *sdp; @@ -1492,9 +1490,7 @@ sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) SCSI_LOG_TIMEOUT(3, sdev_printk(KERN_INFO, scsidp, "sg_alloc: dev=%d \n", k)); - sprintf(disk->disk_name, "sg%d", k); - disk->first_minor = k; - sdp->disk = disk; + sprintf(sdp->name, "sg%d", k); sdp->device = scsidp; mutex_init(&sdp->open_rel_lock); INIT_LIST_HEAD(&sdp->sfds); @@ -1521,19 +1517,11 @@ static int sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) { struct scsi_device *scsidp = to_scsi_device(cl_dev->parent); - struct gendisk *disk; Sg_device *sdp = NULL; struct cdev * cdev = NULL; int error; unsigned long iflags; - disk = alloc_disk(1); - if (!disk) { - pr_warn("%s: alloc_disk failed\n", __func__); - return -ENOMEM; - } - 
disk->major = SCSI_GENERIC_MAJOR; - error = -ENOMEM; cdev = cdev_alloc(); if (!cdev) { @@ -1543,7 +1531,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) cdev->owner = THIS_MODULE; cdev->ops = &sg_fops; - sdp = sg_alloc(disk, scsidp); + sdp = sg_alloc(scsidp); if (IS_ERR(sdp)) { pr_warn("%s: sg_alloc failed\n", __func__); error = PTR_ERR(sdp); @@ -1561,7 +1549,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) sg_class_member = device_create(sg_sysfs_class, cl_dev->parent, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), - sdp, "%s", disk->disk_name); + sdp, "%s", sdp->name); if (IS_ERR(sg_class_member)) { pr_err("%s: device_create failed\n", __func__); error = PTR_ERR(sg_class_member); @@ -1589,7 +1577,6 @@ cdev_add_err: kfree(sdp); out: - put_disk(disk); if (cdev) cdev_del(cdev); return error; @@ -1613,7 +1600,6 @@ sg_device_destroy(struct kref *kref) SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_device_destroy\n")); - put_disk(sdp->disk); kfree(sdp); } @@ -2606,7 +2592,7 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v) goto skip; read_lock(&sdp->sfd_lock); if (!list_empty(&sdp->sfds)) { - seq_printf(s, " >>> device=%s ", sdp->disk->disk_name); + seq_printf(s, " >>> device=%s ", sdp->name); if (atomic_read(&sdp->detaching)) seq_puts(s, "detaching pending close "); else if (sdp->device) { diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index a6d3ac0a6cbc..2942a4ec9bdd 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -106,6 +106,8 @@ static struct scsi_driver sr_template = { static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG]; static DEFINE_SPINLOCK(sr_index_lock); +static struct lock_class_key sr_bio_compl_lkclass; + /* This semaphore is used to mediate the 0->1 reference get in the * face of object destruction (i.e. 
we can't allow a get on an * object after last put) */ @@ -712,7 +714,8 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = alloc_disk(1); + disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE, + &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); @@ -729,6 +732,7 @@ static int sr_probe(struct device *dev) disk->major = SCSI_CDROM_MAJOR; disk->first_minor = minor; + disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; @@ -762,7 +766,6 @@ static int sr_probe(struct device *dev) set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; - disk->queue = sdev->request_queue; if (register_cdrom(disk, &cd->cdi)) goto fail_minor; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index c6f14540ae03..d1abc020f3c0 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -309,13 +309,8 @@ static char * st_incompatible(struct scsi_device* SDp) } -static inline char *tape_name(struct scsi_tape *tape) -{ - return tape->disk->disk_name; -} - #define st_printk(prefix, t, fmt, a...) \ - sdev_prefix_printk(prefix, (t)->device, tape_name(t), fmt, ##a) + sdev_prefix_printk(prefix, (t)->device, (t)->name, fmt, ##a) #ifdef DEBUG #define DEBC_printk(t, fmt, a...) 
\ if (debugging) { st_printk(ST_DEB_MSG, t, fmt, ##a ); } @@ -363,7 +358,7 @@ static int st_chk_result(struct scsi_tape *STp, struct st_request * SRpnt) int result = SRpnt->result; u8 scode; DEB(const char *stp;) - char *name = tape_name(STp); + char *name = STp->name; struct st_cmdstatus *cmdstatp; if (!result) @@ -3841,8 +3836,9 @@ static long st_ioctl_common(struct file *file, unsigned int cmd_in, void __user !capable(CAP_SYS_RAWIO)) i = -EPERM; else - i = scsi_cmd_ioctl(STp->disk->queue, STp->disk, - file->f_mode, cmd_in, p); + i = scsi_cmd_ioctl(STp->device->request_queue, + NULL, file->f_mode, cmd_in, + p); if (i != -ENOTTY) return i; break; @@ -4216,7 +4212,7 @@ static int create_one_cdev(struct scsi_tape *tape, int mode, int rew) i = mode << (4 - ST_NBR_MODE_BITS); snprintf(name, 10, "%s%s%s", rew ? "n" : "", - tape->disk->disk_name, st_formats[i]); + tape->name, st_formats[i]); dev = device_create(&st_sysfs_class, &tape->device->sdev_gendev, cdev_devno, &tape->modes[mode], "%s", name); @@ -4271,7 +4267,6 @@ static void remove_cdevs(struct scsi_tape *tape) static int st_probe(struct device *dev) { struct scsi_device *SDp = to_scsi_device(dev); - struct gendisk *disk = NULL; struct scsi_tape *tpnt = NULL; struct st_modedef *STm; struct st_partstat *STps; @@ -4301,27 +4296,13 @@ static int st_probe(struct device *dev) goto out; } - disk = alloc_disk(1); - if (!disk) { - sdev_printk(KERN_ERR, SDp, - "st: out of memory. Device not attached.\n"); - goto out_buffer_free; - } - tpnt = kzalloc(sizeof(struct scsi_tape), GFP_KERNEL); if (tpnt == NULL) { sdev_printk(KERN_ERR, SDp, "st: Can't allocate device descriptor.\n"); - goto out_put_disk; + goto out_buffer_free; } kref_init(&tpnt->kref); - tpnt->disk = disk; - disk->private_data = &tpnt->driver; - /* SCSI tape doesn't register this gendisk via add_disk(). Manually - * take queue reference that release_disk() expects. 
*/ - if (!blk_get_queue(SDp->request_queue)) - goto out_put_disk; - disk->queue = SDp->request_queue; tpnt->driver = &st_template; tpnt->device = SDp; @@ -4394,10 +4375,10 @@ static int st_probe(struct device *dev) idr_preload_end(); if (error < 0) { pr_warn("st: idr allocation failed: %d\n", error); - goto out_put_queue; + goto out_free_tape; } tpnt->index = error; - sprintf(disk->disk_name, "st%d", tpnt->index); + sprintf(tpnt->name, "st%d", tpnt->index); tpnt->stats = kzalloc(sizeof(struct scsi_tape_stats), GFP_KERNEL); if (tpnt->stats == NULL) { sdev_printk(KERN_ERR, SDp, @@ -4414,9 +4395,9 @@ static int st_probe(struct device *dev) scsi_autopm_put_device(SDp); sdev_printk(KERN_NOTICE, SDp, - "Attached scsi tape %s\n", tape_name(tpnt)); + "Attached scsi tape %s\n", tpnt->name); sdev_printk(KERN_INFO, SDp, "%s: try direct i/o: %s (alignment %d B)\n", - tape_name(tpnt), tpnt->try_dio ? "yes" : "no", + tpnt->name, tpnt->try_dio ? "yes" : "no", queue_dma_alignment(SDp->request_queue) + 1); return 0; @@ -4428,10 +4409,7 @@ out_idr_remove: spin_lock(&st_index_lock); idr_remove(&st_index_idr, tpnt->index); spin_unlock(&st_index_lock); -out_put_queue: - blk_put_queue(disk->queue); -out_put_disk: - put_disk(disk); +out_free_tape: kfree(tpnt); out_buffer_free: kfree(buffer); @@ -4470,7 +4448,6 @@ static int st_remove(struct device *dev) static void scsi_tape_release(struct kref *kref) { struct scsi_tape *tpnt = to_scsi_tape(kref); - struct gendisk *disk = tpnt->disk; tpnt->device = NULL; @@ -4480,8 +4457,6 @@ static void scsi_tape_release(struct kref *kref) kfree(tpnt->buffer); } - disk->private_data = NULL; - put_disk(disk); kfree(tpnt->stats); kfree(tpnt); return; diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index 9d3c38bb0794..c0ef0d9aaf8a 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -187,7 +187,7 @@ struct scsi_tape { unsigned char last_cmnd[6]; unsigned char last_sense[16]; #endif - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct kref 
kref; struct scsi_tape_stats *stats; }; diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..1f21ac984253 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include #include #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -686,7 +687,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -707,7 +709,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device @@ -801,7 +802,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -812,8 +812,15 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -826,16 +833,9 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? 
*/ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -902,9 +902,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -921,31 +918,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. 
- */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; @@ -959,12 +931,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1094,148 +1060,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This functions creates the following sysfs symlinks. 
- * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. 
- */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the calimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -1260,11 +1084,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1282,16 +1103,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if 
(ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1299,16 +1118,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); done: part->bd_openers++; return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1136,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** @@ -1409,7 +1224,7 @@ struct block_device *blkdev_get_by_dev(dev_t 
dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include #include +#include #include "fat.h" struct fatent_operations { diff --git a/fs/io-wq.c b/fs/io-wq.c index 7d2ed8c7dd31..cd9bd095fb1b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -51,6 +51,10 @@ struct io_worker { struct completion ref_done; + unsigned long create_state; + struct callback_head create_work; + int create_index; + struct rcu_head rcu; }; @@ -174,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker) complete(&worker->ref_done); wait_for_completion(&worker->ref_done); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -184,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -250,18 +254,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (!ret) { bool do_create = false, first = false; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - atomic_inc(&acct->nr_running); - 
atomic_inc(&wqe->wq->worker_refs); if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); - if (do_create) + raw_spin_unlock(&wqe->lock); + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); create_io_worker(wqe->wq, wqe, acct->index, first); + } } } @@ -272,60 +277,63 @@ static void io_wqe_inc_running(struct io_worker *worker) atomic_inc(&acct->nr_running); } -struct create_worker_data { - struct callback_head work; - struct io_wqe *wqe; - int index; -}; - static void create_worker_cb(struct callback_head *cb) { - struct create_worker_data *cwd; + struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; bool do_create = false, first = false; - cwd = container_of(cb, struct create_worker_data, work); - wqe = cwd->wqe; + worker = container_of(cb, struct io_worker, create_work); + wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[cwd->index]; - raw_spin_lock_irq(&wqe->lock); + acct = &wqe->acct[worker->create_index]; + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { - create_io_worker(wq, wqe, cwd->index, first); + create_io_worker(wq, wqe, worker->create_index, first); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } -static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) +static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, + struct io_wqe_acct *acct) { - struct create_worker_data *cwd; struct io_wq *wq = wqe->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) goto fail; + if (!io_worker_get(worker)) + goto fail; + /* + * create_state manages ownership of 
create_work/index. We should + * only need one entry per worker, as the worker going to sleep + * will trigger the condition, and waking will clear it once it + * runs the task_work. + */ + if (test_bit(0, &worker->create_state) || + test_and_set_bit_lock(0, &worker->create_state)) + goto fail_release; - cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); - if (cwd) { - init_task_work(&cwd->work, create_worker_cb); - cwd->wqe = wqe; - cwd->index = acct->index; - if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL)) - return; - - kfree(cwd); - } + init_task_work(&worker->create_work, create_worker_cb); + worker->create_index = acct->index; + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + return; + clear_bit_unlock(0, &worker->create_state); +fail_release: + io_worker_release(worker); fail: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -343,7 +351,7 @@ static void io_wqe_dec_running(struct io_worker *worker) if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(wqe, acct); + io_queue_worker_create(wqe, worker, acct); } } @@ -416,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) spin_unlock(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * We can always run the work if the worker is currently the same type as + * the work (eg both are bound, or both are unbound). If they are not the + * same, only allow it if incrementing the worker count would be allowed. 
+ */ +static bool io_worker_can_run_work(struct io_worker *worker, + struct io_wq_work *work) +{ + struct io_wqe_acct *acct; + + if (!(worker->flags & IO_WORKER_F_BOUND) != + !(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + + /* not the same type, check if we'd go over the limit */ + acct = io_work_get_acct(worker->wqe, work); + return acct->nr_workers < acct->max_workers; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_worker *worker, + bool *stalled) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; @@ -428,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) work = container_of(node, struct io_wq_work, list); + if (!io_worker_can_run_work(worker, work)) + break; + /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { wq_list_del(&wqe->work_list, node, prev); @@ -454,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); + *stalled = true; } return NULL; @@ -477,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock_irq(&worker->lock); + spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock_irq(&worker->lock); + spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -493,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; + bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -501,13 +535,14 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. 
*/ - work = io_get_next_work(wqe); + stalled = false; + work = io_get_next_work(wqe, worker, &stalled); if (work) __io_worker_busy(wqe, worker, work); - else if (!wq_list_empty(&wqe->work_list)) + else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -539,16 +574,16 @@ get_next: clear_bit(hash, &wq->hash->map); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); } } while (work); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); } while (1); } @@ -569,13 +604,13 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (io_wqe_run_queue(wqe)) { io_worker_handle_work(worker); goto loop; } __io_worker_idle(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -594,7 +629,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); io_worker_handle_work(worker); } @@ -636,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&worker->wqe->lock); + raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&worker->wqe->lock); + raw_spin_unlock(&worker->wqe->lock); } static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) @@ -664,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo kfree(worker); fail: atomic_dec(&acct->nr_running); - raw_spin_lock_irq(&wqe->lock); + 
raw_spin_lock(&wqe->lock); acct->nr_workers--; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); return; } @@ -676,7 +711,7 @@ fail: set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -684,7 +719,7 @@ fail: worker->flags |= IO_WORKER_F_BOUND; if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); wake_up_new_task(tsk); } @@ -759,8 +794,7 @@ append: static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - int work_flags; - unsigned long flags; + bool do_wake; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -772,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - work_flags = work->flags; - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - raw_spin_unlock_irqrestore(&wqe->lock, flags); + do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running); + raw_spin_unlock(&wqe->lock); - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) + if (do_wake) io_wqe_wake_worker(wqe, acct); } @@ -805,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; - unsigned long flags; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. 
*/ - spin_lock_irqsave(&worker->lock, flags); + spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock_irqrestore(&worker->lock, flags); + spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -845,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { struct io_wq_work_node *node, *prev; struct io_wq_work *work; - unsigned long flags; retry: - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -863,7 +895,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1004,12 +1036,12 @@ err_wq: static bool io_task_work_match(struct callback_head *cb, void *data) { - struct create_worker_data *cwd; + struct io_worker *worker; if (cb->func != create_worker_cb) return false; - cwd = container_of(cb, struct create_worker_data, work); - return cwd->wqe->wq == data; + worker = container_of(cb, struct io_worker, create_work); + return worker->wqe->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1026,12 +1058,13 @@ static void io_wq_exit_workers(struct io_wq *wq) return; while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct create_worker_data *cwd; + struct io_worker *worker; - cwd = container_of(cb, struct create_worker_data, work); - atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); + worker = container_of(cb, struct io_worker, create_work); + 
atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); io_worker_ref_put(wq); - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } rcu_read_lock(); @@ -1143,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. + */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 3999ee58ff26..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { + node->next = NULL; if (!list->first) { list->last = node; WRITE_ONCE(list->first, node); @@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } - node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index a2e20a6fbfed..7cc458e0b636 
100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,17 +92,12 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* - * Shift of 9 is 512 entries, or exactly one page on 64-bit archs - */ -#define IORING_FILE_TABLE_SHIFT 9 -#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) -#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) -#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +/* only define max */ +#define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -235,8 +230,7 @@ struct io_rsrc_put { }; struct io_file_table { - /* two level table */ - struct io_fixed_file **files; + struct io_fixed_file *files; }; struct io_rsrc_node { @@ -301,18 +295,10 @@ struct io_sq_data { struct completion exited; }; -#define IO_IOPOLL_BATCH 8 #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_comp_state { - struct io_kiocb *reqs[IO_COMPL_BATCH]; - unsigned int nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; -}; - struct io_submit_link { struct io_kiocb *head; struct io_kiocb *last; @@ -333,14 +319,11 @@ struct io_submit_state { /* * Batch completion logic */ - struct io_comp_state comp; + struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; + unsigned int compl_nr; + /* inline/task_work completion list, under ->uring_lock */ + struct list_head free_list; - /* - * File reference cache - */ - struct file *file; - unsigned int fd; - unsigned int file_refs; unsigned int ios_left; }; @@ -392,6 +375,7 @@ struct io_ring_ctx { struct io_submit_state submit_state; struct list_head timeout_list; + struct list_head ltimeout_list; struct 
list_head cq_overflow_list; struct xarray io_buffers; struct xarray personalities; @@ -426,6 +410,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + spinlock_t timeout_lock; + /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -487,8 +473,8 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - unsigned long task_state; struct callback_head task_work; + bool task_running; }; /* @@ -523,6 +509,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -530,6 +517,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + u32 file_slot; unsigned long nofile; }; @@ -553,6 +541,8 @@ struct io_timeout { struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; }; struct io_timeout_rem { @@ -562,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -593,6 +584,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -677,7 +669,6 @@ struct io_unlink { struct io_completion { struct file *file; - struct list_head list; u32 cflags; }; @@ -719,14 +710,15 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_ASYNC_READ_BIT, - REQ_F_ASYNC_WRITE_BIT, + REQ_F_NOWAIT_READ_BIT, + REQ_F_NOWAIT_WRITE_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -763,8 +755,6 @@ enum { REQ_F_POLLED = 
BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* linked timeout is active, i.e. prepared by link's head */ - REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -772,13 +762,17 @@ enum { /* don't attempt request reissue, see io_rw_reissue() */ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), /* supports async reads */ - REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), /* supports async writes */ - REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), }; struct async_poll { @@ -786,7 +780,7 @@ struct async_poll { struct io_poll_iocb *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { @@ -1045,37 +1039,38 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) + static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx 
*ctx, u64 user_data, long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req, int nr); +static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); -static void io_put_task(struct task_struct *task, int nr); -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_submit_state *state, +static struct file *io_file_get(struct io_ring_ctx *ctx, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); -static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); -static void io_fallback_req_func(struct work_struct *unused); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -1094,9 +1089,65 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. 
+ */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_put_and_test(req)); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, nr); + } +} + +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1141,6 +1192,12 @@ static inline void req_set_fail(struct io_kiocb *req) req->flags |= REQ_F_FAIL; } +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + req->result = res; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1153,6 +1210,27 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + bool locked = false; + + 
percpu_ref_get(&ctx->refs); + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req, &locked); + + if (locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } + percpu_ref_put(&ctx->refs); + +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1198,15 +1276,17 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); + spin_lock_init(&ctx->timeout_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); + INIT_LIST_HEAD(&ctx->submit_state.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; @@ -1236,6 +1316,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); +} + static void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -1244,6 +1338,32 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static inline void io_unprep_linked_timeout(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_TIMEOUT; +} + +static struct io_kiocb *__io_prep_linked_timeout(struct 
io_kiocb *req) +{ + if (WARN_ON_ONCE(!req->link)) + return NULL; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + req->flags |= REQ_F_LINK_TIMEOUT; + + /* linked timeouts should have two refs once prep'ed */ + io_req_set_refcount(req); + __io_req_set_refcount(req->link, 2); + return req->link; +} + +static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) + return NULL; + return __io_prep_linked_timeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1283,22 +1403,25 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } -static void io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + /* must not take the lock, NULL it as a precaution */ + locked = NULL; + BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1324,6 +1447,7 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -1332,7 +1456,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); io_cqring_fill_event(req->ctx, req->user_data, status, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } } @@ -1351,9 +1475,11 @@ static void 
io_queue_deferred(struct io_ring_ctx *ctx) } static void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + spin_lock_irq(&ctx->timeout_lock); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1378,6 +1504,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; + spin_unlock_irq(&ctx->timeout_lock); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1434,13 +1561,22 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return !ctx->eventfd_async || io_wq_current_is_worker(); } +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. 
+ */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) @@ -1453,12 +1589,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); @@ -1471,14 +1604,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - unsigned long flags; bool all_flushed, posted; if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -1506,13 +1638,13 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (posted) io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; @@ -1520,7 +1652,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force); + ret = __io_cqring_overflow_flush(ctx, false); if (ctx->flags & 
IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -1528,39 +1660,37 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return ret; } -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. - */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) { - return atomic_inc_not_zero(&req->refs); + struct io_uring_task *tctx = task->io_uring; + + if (likely(task == current)) { + tctx->cached_refs += nr; + } else { + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); + } } -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) +static void io_task_refs_refill(struct io_uring_task *tctx) { - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, ¤t->usage); + tctx->cached_refs += refill; } -static inline bool req_ref_put_and_test(struct io_kiocb *req) +static inline void io_get_task_refs(int nr) { - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} + struct io_uring_task *tctx = current->io_uring; -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); } static bool io_cqring_event_overflow(struct 
io_ring_ctx *ctx, u64 user_data, @@ -1624,9 +1754,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked @@ -1634,7 +1763,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, */ if (req_ref_put_and_test(req)) { if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) + if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1643,14 +1772,14 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) req = NULL; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (req) { io_cqring_ev_posted(ctx); @@ -1690,24 +1819,35 @@ static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { req_set_fail(req); - io_put_req(req); io_req_complete_post(req, res, 0); } -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_comp_state *cs) +/* + * Don't initialise the fields below on every allocation, but do that in + * advance and keep them valid across allocations. 
+ */ +static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) { - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &cs->free_list); + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; +} + +static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, + struct io_submit_state *state) +{ + spin_lock(&ctx->completion_lock); + list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } /* Returns true IFF there are requests in the cache */ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - struct io_comp_state *cs = &state->comp; int nr; /* @@ -1716,14 +1856,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * side cache. */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, cs); + io_flush_cached_locked_reqs(ctx, state); nr = state->free_reqs; - while (!list_empty(&cs->free_list)) { - struct io_kiocb *req = list_first_entry(&cs->free_list, - struct io_kiocb, compl.list); + while (!list_empty(&state->free_list)) { + struct io_kiocb *req = list_first_entry(&state->free_list, + struct io_kiocb, inflight_entry); - list_del(&req->compl.list); + list_del(&req->inflight_entry); state->reqs[nr++] = req; if (nr == ARRAY_SIZE(state->reqs)) break; @@ -1733,48 +1873,41 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) return nr != 0; } +/* + * A request might get retired back into the request caches even before opcode + * handlers and io_issue_sqe() are done with it, e.g. inline completion path. + * Because of that, io_alloc_req() should be called only under ->uring_lock + * and with extra caution to not get a request that is still worked on. 
+ */ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - if (!state->free_reqs) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret, i; + if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) + goto got_req; - if (io_flush_cached_reqs(ctx)) - goto got_req; + ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, + state->reqs); - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); - - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; - ret = 1; - } - - /* - * Don't initialise the fields below on every allocation, but - * do that in advance and keep valid on free. - */ - for (i = 0; i < ret; i++) { - struct io_kiocb *req = state->reqs[i]; - - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->result = 0; - } - state->free_reqs = ret; + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. 
+ */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + return NULL; + ret = 1; } + + for (i = 0; i < ret; i++) + io_preinit_req(state->reqs[i], ctx); + state->free_reqs = ret; got_req: state->free_reqs--; return state->reqs[state->free_reqs]; @@ -1802,17 +1935,6 @@ static void io_dismantle_req(struct io_kiocb *req) } } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1820,7 +1942,11 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); io_put_task(req->task, 1); - kmem_cache_free(req_cachep, req); + spin_lock(&ctx->completion_lock); + list_add(&req->inflight_entry, &ctx->locked_free_list); + ctx->locked_free_nr++; + spin_unlock(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); } @@ -1834,22 +1960,20 @@ static inline void io_remove_next_linked(struct io_kiocb *req) static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *link = req->link; - /* - * Can happen if a linked timeout fired and link had been like - * req -> link t-out -> link t-out [-> ...] 
- */ - if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&link->timeout.list); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 1); + io_put_req_deferred(link); return true; } } @@ -1863,12 +1987,17 @@ static void io_fail_links(struct io_kiocb *req) req->link = NULL; while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->result; + nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 2); + io_cqring_fill_event(link->ctx, link->user_data, res, 0); + io_put_req_deferred(link); link = nxt; } } @@ -1878,8 +2007,24 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + if (req->flags & REQ_F_ARM_LTIMEOUT) { + struct io_kiocb *link = req->link; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); + io_put_req_deferred(link); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); posted = io_kill_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); @@ -1898,16 +2043,15 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. 
*/ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { + if (req->flags & IO_DISARM_MASK) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; bool posted; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) io_commit_cqring(req->ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); } @@ -1923,20 +2067,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static void ctx_flush_and_put(struct io_ring_ctx *ctx) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; - if (ctx->submit_state.comp.nr) { - mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(ctx); + if (*locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); + *locked = false; } percpu_ref_put(&ctx->refs); } static void tctx_task_work(struct callback_head *cb) { + bool locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1947,37 +2093,32 @@ static void tctx_task_work(struct callback_head *cb) spin_lock_irq(&tctx->task_lock); node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); + if (!node) + tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); + if (!node) + break; - while (node) { + do { struct io_wq_work_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); ctx = req->ctx; + /* if not contended, grab and improve batching */ + locked = mutex_trylock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); node = next; - } - if (wq_list_empty(&tctx->task_list)) { - 
spin_lock_irq(&tctx->task_lock); - clear_bit(0, &tctx->task_state); - if (wq_list_empty(&tctx->task_list)) { - spin_unlock_irq(&tctx->task_lock); - break; - } - spin_unlock_irq(&tctx->task_lock); - /* another tctx_task_work() is enqueued, yield */ - if (test_and_set_bit(0, &tctx->task_state)) - break; - } + } while (node); + cond_resched(); } - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); } static void io_req_task_work_add(struct io_kiocb *req) @@ -1987,19 +2128,20 @@ static void io_req_task_work_add(struct io_kiocb *req) enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; + bool running; WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + running = tctx->task_running; + if (!running) + tctx->task_running = true; spin_unlock_irqrestore(&tctx->task_lock, flags); /* task_work already pending, we're done */ - if (test_bit(0, &tctx->task_state) || - test_and_set_bit(0, &tctx->task_state)) + if (running) return; - if (unlikely(tsk->flags & PF_EXITING)) - goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. 
For @@ -2012,9 +2154,9 @@ static void io_req_task_work_add(struct io_kiocb *req) wake_up_process(tsk); return; } -fail: - clear_bit(0, &tctx->task_state); + spin_lock_irqsave(&tctx->task_lock, flags); + tctx->task_running = false; node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); @@ -2028,27 +2170,25 @@ fail: } } -static void io_req_task_cancel(struct io_kiocb *req) +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx is guaranteed to stay alive while we hold uring_lock */ - mutex_lock(&ctx->uring_lock); + /* not needed for normal modes, but SQPOLL depends on it */ + io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); - mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ - mutex_lock(&ctx->uring_lock); - if (!(req->task->flags & PF_EXITING) && !req->task->in_execve) + io_tw_lock(ctx, locked); + /* req->task == current here, checking PF_EXITING is safe */ + if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); - mutex_unlock(&ctx->uring_lock); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -2084,6 +2224,11 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +static void io_free_req_work(struct io_kiocb *req, bool *locked) +{ + io_free_req(req); +} + struct req_batch { struct task_struct *task; int task_refs; @@ -2100,10 +2245,10 @@ static inline void io_init_req_batch(struct req_batch *rb) static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); + if (rb->task) + 
io_put_task(rb->task, rb->task_refs); } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, @@ -2124,37 +2269,37 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->compl.list, &state->comp.free_list); + list_add(&req->inflight_entry, &state->free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_comp_state *cs = &ctx->submit_state.comp; - int i, nr = cs->nr; + struct io_submit_state *state = &ctx->submit_state; + int i, nr = state->compl_nr; struct req_batch rb; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); } io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; - /* submission and completion refs */ - if (req_ref_sub_and_test(req, 2)) + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } io_req_free_batch_finish(ctx, &rb); - cs->nr = 0; + state->compl_nr = 0; } /* @@ -2178,16 +2323,12 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_free_req_deferred(struct io_kiocb *req) +static inline void io_put_req_deferred(struct io_kiocb *req) { - req->io_task_work.func = io_free_req; - io_req_task_work_add(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req, int refs) -{ - if (req_ref_sub_and_test(req, refs)) - io_free_req_deferred(req); + if (req_ref_put_and_test(req)) { + req->io_task_work.func = io_free_req_work; + 
io_req_task_work_add(req); + } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -2220,6 +2361,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; return io_put_kbuf(req, kbuf); } @@ -2239,7 +2382,7 @@ static inline bool io_run_task_work(void) * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done, bool resubmit) + struct list_head *done) { struct req_batch rb; struct io_kiocb *req; @@ -2249,23 +2392,18 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_init_req_batch(&rb); while (!list_empty(done)) { - int cflags = 0; - req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN && resubmit && + if (READ_ONCE(req->result) == -EAGAIN && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - req_ref_get(req); io_req_task_queue_reissue(req); continue; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - - __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -2278,12 +2416,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min, bool resubmit) + long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); bool spin; - int ret; /* * Only spin for completions if we don't have multiple devices hanging @@ -2291,9 +2428,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, */ spin = !ctx->poll_multi_queue && *nr_events < min; - ret = 0; list_for_each_entry_safe(req, tmp, 
&ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; + int ret; /* * Move completed and retryable entries to our local lists. @@ -2308,22 +2445,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, break; ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); - if (ret < 0) - break; + if (unlikely(ret < 0)) + return ret; + else if (ret) + spin = false; /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) list_move_tail(&req->inflight_entry, &done); - - if (ret && spin) - spin = false; - ret = 0; } if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done, resubmit); + io_iopoll_complete(ctx, nr_events, &done); - return ret; + return 0; } /* @@ -2339,7 +2474,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 0, false); + io_do_iopoll(ctx, &nr_events, 0); /* let it sleep and repeat later if can't complete a request */ if (nr_events == 0) @@ -2401,7 +2536,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min, true); + ret = io_do_iopoll(ctx, &nr_events, min); } while (!ret && nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); @@ -2470,44 +2605,57 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void io_fallback_req_func(struct work_struct *work) +static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - - percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); - percpu_ref_put(&ctx->refs); -} - -static void __io_complete_rw(struct io_kiocb *req, long res, long 
res2, - unsigned int issue_flags) -{ - int cflags = 0; - if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; - return; + return true; } req_set_fail(req); + req->result = res; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, res, cflags); + return false; +} + +static void io_req_task_complete(struct io_kiocb *req, bool *locked) +{ + unsigned int cflags = io_put_rw_kbuf(req); + long res = req->result; + + if (*locked) { + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + + io_req_complete_state(req, res, cflags); + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) + io_submit_flush_completions(ctx); + } else { + io_req_complete_post(req, res, cflags); + } +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - __io_complete_rw(req, res, res2, 0); + if (__io_complete_rw_common(req, res)) + return; + req->result = res; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2593,40 +2741,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req) } } -static inline void io_state_file_put(struct io_submit_state *state) -{ - if (state->file_refs) { - fput_many(state->file, state->file_refs); - state->file_refs = 0; - } -} - -/* - * Get as many references to a file as we have IOs left in this submission, - * assuming most submissions are for one file, or at least that 
each file - * has more than one submission. - */ -static struct file *__io_file_get(struct io_submit_state *state, int fd) -{ - if (!state) - return fget(fd); - - if (state->file_refs) { - if (state->fd == fd) { - state->file_refs--; - return state->file; - } - io_state_file_put(state); - } - state->file = fget_many(fd, state->ios_left); - if (unlikely(!state->file)) - return NULL; - - state->fd = fd; - state->file_refs = state->ios_left - 1; - return state->file; -} - static bool io_bdev_nowait(struct block_device *bdev) { return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); @@ -2637,7 +2751,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2670,14 +2784,14 @@ static bool __io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static bool io_file_supports_async(struct io_kiocb *req, int rw) +static bool io_file_supports_nowait(struct io_kiocb *req, int rw) { - if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) return true; - else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) return true; - return __io_file_supports_async(req->file, rw); + return __io_file_supports_nowait(req->file, rw); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -2688,7 +2802,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) + if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -2788,15 +2902,11 @@ static void 
kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { - req_ref_get(req); io_req_task_queue_reissue(req); } else { - int cflags = 0; - req_set_fail(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, + io_put_rw_kbuf(req)); } } } @@ -3214,9 +3324,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); - - /* submit ref gets dropped, acquire a new one */ - req_ref_get(req); io_req_task_queue(req); return 1; } @@ -3301,7 +3408,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, READ)) { + if (force_nonblock && !io_file_supports_nowait(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3406,7 +3513,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, WRITE)) + if (force_nonblock && !io_file_supports_nowait(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -3481,7 +3588,7 @@ static int io_renameat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3532,7 +3639,8 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || 
sqe->buf_index) + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3578,8 +3686,8 @@ static int io_shutdown_prep(struct io_kiocb *req, #if defined(CONFIG_NET) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->shutdown.how = READ_ONCE(sqe->len); @@ -3628,7 +3736,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -3727,7 +3835,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -3760,7 +3869,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3791,6 +3901,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & 
REQ_F_FIXED_FILE)) @@ -3808,6 +3920,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3815,12 +3932,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - u64 flags, mode; + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - mode = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } @@ -3831,8 +3945,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) @@ -3850,8 +3962,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3870,9 +3982,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ -3881,7 +3995,8 @@ static int io_openat2(struct io_kiocb *req, unsigned 
int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3894,7 +4009,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3915,7 +4035,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -3986,7 +4107,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4073,7 +4194,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4119,7 +4240,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) return 
-EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4154,7 +4275,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr) + if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4192,7 +4313,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4228,7 +4349,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4289,7 +4410,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4723,6 +4845,15 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || + (accept->flags & SOCK_CLOEXEC))) + return -EINVAL; + if (accept->flags & ~(SOCK_CLOEXEC | 
SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return 0; } @@ -4731,20 +4862,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - int ret; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags, - accept->nofile); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret < 0) { + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + accept->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4764,7 +4910,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4877,6 +5024,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + /* req->task == current here, checking PF_EXITING is safe */ if 
(unlikely(req->task->flags & PF_EXITING)) WRITE_ONCE(poll->canceled, true); @@ -4886,7 +5034,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) req->result = vfs_poll(req->file, &pt) & poll->events; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req->result && !READ_ONCE(poll->canceled)) { add_wait_queue(poll->head, &poll->wait); return true; @@ -4920,12 +5068,12 @@ static void io_poll_remove_double(struct io_kiocb *req) if (poll && poll->head) { struct wait_queue_head *head = poll->head; - spin_lock(&head->lock); + spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) req_ref_put(req); poll->head = NULL; - spin_unlock(&head->lock); + spin_unlock_irq(&head->lock); } } @@ -4955,13 +5103,13 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct io_kiocb *req) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { bool done; @@ -4973,13 +5121,13 @@ static void io_poll_task_func(struct io_kiocb *req) req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (done) { nxt = io_put_req_find_next(req); if (nxt) - io_req_task_submit(nxt); + io_req_task_submit(nxt, locked); } } } @@ -4990,6 +5138,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); + unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -5002,13 +5151,13 @@ static int 
io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (poll->head) { bool done; - spin_lock(&poll->head->lock); + spin_lock_irqsave(&poll->head->lock, flags); done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); /* make sure double remove sees this as being gone */ wait->private = NULL; - spin_unlock(&poll->head->lock); + spin_unlock_irqrestore(&poll->head->lock, flags); if (!done) { /* use wait func handler, so it matches the rq type */ poll->wait.func(&poll->wait, mode, sync, key); @@ -5045,8 +5194,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, if (unlikely(pt->nr_entries)) { struct io_poll_iocb *poll_one = poll; + /* double add on the same waitqueue head, ignore */ + if (poll_one->head == head) + return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; pt->error = -EINVAL; return; } @@ -5056,9 +5210,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (!(poll_one->events & EPOLLONESHOT)) poll_one->events |= EPOLLONESHOT; - /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) - return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; @@ -5088,7 +5239,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct io_kiocb *req) +static void io_async_task_func(struct io_kiocb *req, bool *locked) { struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5096,16 +5247,16 @@ static void io_async_task_func(struct io_kiocb *req) trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return; } hash_del(&req->hash_node); io_poll_remove_double(req); - 
spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req); + io_req_task_submit(req, locked); else io_req_complete_failed(req, -ECANCELED); } @@ -5154,11 +5305,11 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, if (unlikely(!ipt->nr_entries) && !ipt->error) ipt->error = -EINVAL; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) io_poll_remove_double(req); if (likely(poll->head)) { - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (unlikely(list_empty(&poll->wait.entry))) { if (ipt->error) cancel = true; @@ -5171,7 +5322,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ io_poll_req_insert(req); - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); } return mask; @@ -5213,7 +5364,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) } /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req, rw)) + if (!io_file_supports_nowait(req, rw)) return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -5223,16 +5374,14 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_req_set_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret || ipt.error) { - spin_unlock_irq(&ctx->completion_lock); - if (ret) - return IO_APOLL_READY; - return IO_APOLL_ABORTED; - } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); + if (ret || ipt.error) + return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; @@ -5246,46 +5395,32 @@ static bool __io_poll_remove_one(struct io_kiocb *req, if (!poll->head) return false; - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (do_cancel) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; } - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); hash_del(&req->hash_node); return do_complete; } -static bool io_poll_remove_waitqs(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - io_poll_remove_double(req); - do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - - if (req->opcode != IORING_OP_POLL_ADD && do_complete) { - /* non-poll requests have submit ref still */ - req_ref_put(req); - } - return do_complete; -} - static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { bool do_complete; - do_complete = io_poll_remove_waitqs(req); + io_poll_remove_double(req); + do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); + if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } - return do_complete; } @@ -5299,7 +5434,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req; int posted = 0, i; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list; @@ -5309,7 +5444,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, posted += io_poll_remove_one(req); } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (posted) 
io_cqring_ev_posted(ctx); @@ -5372,7 +5507,7 @@ static int io_poll_update_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -5427,6 +5562,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -5447,7 +5583,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.error = 0; io_poll_complete(req, mask); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); @@ -5464,7 +5600,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) bool completing; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; @@ -5491,7 +5627,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) ret = 0; err: if (ret < 0) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); req_set_fail(req); io_req_complete(req, ret); return 0; @@ -5504,7 +5640,7 @@ err: } if (req->poll_update.update_user_data) preq->user_data = req->poll_update.new_user_data; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* complete update request, we're done with it */ io_req_complete(req, ret); @@ -5519,6 +5655,12 @@ err: return 0; } +static void io_req_task_timeout(struct io_kiocb *req, bool *locked) +{ + req_set_fail(req); + io_req_complete_post(req, -ETIME, 0); +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = 
container_of(timer, @@ -5527,24 +5669,20 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - io_cqring_ev_posted(ctx); - req_set_fail(req); - io_put_req(req); + req->io_task_work.func = io_req_task_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_kiocb *req; @@ -5567,6 +5705,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5575,13 +5714,54 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); + return 0; +} + +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + 
__must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; @@ -5592,7 +5772,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5607,13 +5787,18 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & 
~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5640,20 +5825,26 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->completion_lock); - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); + spin_unlock_irq(&ctx->timeout_lock); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5666,14 +5857,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || + sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); req->timeout.off = off; 
if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -5683,14 +5879,24 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - if (is_timeout_link) - io_req_track_inflight(req); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + req->timeout.head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } return 0; } @@ -5701,7 +5907,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = req->timeout.off; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -5740,7 +5946,7 @@ add: list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); return 0; } @@ -5783,31 +5989,27 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static void io_async_find_and_cancel(struct io_ring_ctx *ctx, - struct io_kiocb *req, __u64 sqe_addr, - int success_ret) +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) { - unsigned long flags; + struct io_ring_ctx *ctx = req->ctx; int ret; - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irqsave(&ctx->completion_lock, flags); - if (ret != -ENOENT) - goto done; - ret = io_timeout_cancel(ctx, sqe_addr); - if (ret != -ENOENT) 
- goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); -done: - if (!ret) - ret = success_ret; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - if (ret < 0) - req_set_fail(req); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); + if (ret != -ENOENT) + return ret; + + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); + ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock_irq(&ctx->timeout_lock); + if (ret != -ENOENT) + goto out; + ret = io_poll_cancel(ctx, sqe_addr, false); +out: + spin_unlock(&ctx->completion_lock); + return ret; } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5817,7 +6019,8 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || + sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -5831,18 +6034,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) struct io_tctx_node *node; int ret; - /* tasks should wait for their io-wq threads, so safe w/o sync */ - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irq(&ctx->completion_lock); + ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; - ret = io_timeout_cancel(ctx, sqe_addr); - if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); - if (ret != -ENOENT) - goto done; - spin_unlock_irq(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5855,17 +6049,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) break; } 
io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - - spin_lock_irq(&ctx->completion_lock); done: - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5874,7 +6061,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -6076,11 +6263,11 @@ fail: return true; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req); + io_queue_async_work(req, NULL); return true; } @@ -6088,7 +6275,7 @@ fail: de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return true; } @@ -6289,16 +6476,31 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + req = io_put_req_find_next(req); + return req ? 
&req->work : NULL; +} + static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *timeout; int ret = 0; + /* one will be dropped by ->io_free_work() after returning to io-wq */ + if (!(req->flags & REQ_F_REFCOUNT)) + __io_req_set_refcount(req, 2); + else + req_ref_get(req); + timeout = io_prep_linked_timeout(req); if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -6317,29 +6519,14 @@ static void io_wq_submit_work(struct io_wq_work *work) } /* avoid locking problems by failing it from a clean context */ - if (ret) { - /* io-wq is going to take one down */ - req_ref_get(req); + if (ret) io_req_task_queue_fail(req, ret); - } } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) - static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) + unsigned i) { - struct io_fixed_file *table_l2; - - table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; - return &table_l2[i & IORING_FILE_TABLE_MASK]; + return &table->files[i]; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -6354,45 +6541,69 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_async(file, READ)) + if (__io_file_supports_nowait(file, READ)) file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_async(file, WRITE)) + if (__io_file_supports_nowait(file, WRITE)) file_ptr |= FFS_ASYNC_WRITE; if (S_ISREG(file_inode(file)->i_mode)) file_ptr |= FFS_ISREG; file_slot->file_ptr = file_ptr; } -static struct file *io_file_get(struct io_submit_state *state, - struct io_kiocb *req, int fd, bool fixed) 
+static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) { - struct io_ring_ctx *ctx = req->ctx; struct file *file; + unsigned long file_ptr; - if (fixed) { - unsigned long file_ptr; - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); - io_req_set_rsrc_node(req); - } else { - trace_io_uring_file_get(ctx, fd); - file = __io_file_get(state, fd); - - /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); - } - + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); + io_req_set_rsrc_node(req); return file; } +static struct file *io_file_get_normal(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) +{ + struct file *file = fget(fd); + + trace_io_uring_file_get(ctx, fd); + + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); + return file; +} + +static inline struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd, bool fixed) +{ + if (fixed) + return io_file_get_fixed(ctx, req, fd); + else + return io_file_get_normal(ctx, req, fd); +} + +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +{ + struct io_kiocb *prev = req->timeout.prev; + int ret; + + if (prev) { + ret = io_try_cancel_userdata(req, prev->user_data); + 
io_req_complete_post(req, ret ?: -ETIME, 0); + io_put_req(prev); + } else { + io_req_complete_post(req, -ETIME, 0); + } +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6401,7 +6612,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); prev = req->timeout.head; req->timeout.head = NULL; @@ -6414,15 +6625,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - spin_unlock_irqrestore(&ctx->completion_lock, flags); + list_del(&req->timeout.list); + req->timeout.prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); - io_put_req_deferred(prev, 1); - io_put_req_deferred(req, 1); - } else { - io_req_complete_post(req, -ETIME, 0); - } + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -6430,7 +6638,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6441,29 +6649,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + list_add_tail(&req->timeout.list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - if (!nxt || (req->flags & 
REQ_F_LINK_TIMEOUT) || - nxt->opcode != IORING_OP_LINK_TIMEOUT) - return NULL; - - nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; -} - static void __io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; int ret; issue_sqe: @@ -6474,50 +6670,60 @@ issue_sqe: * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; - cs->reqs[cs->nr++] = req; - if (cs->nr == ARRAY_SIZE(cs->reqs)) + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); - } else { - io_put_req(req); + return; } + + linked_timeout = io_prep_linked_timeout(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + linked_timeout = io_prep_linked_timeout(req); + switch (io_arm_poll_handler(req)) { case IO_APOLL_READY: + if (linked_timeout) + io_unprep_linked_timeout(req); goto issue_sqe; case IO_APOLL_ABORTED: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
*/ - io_queue_async_work(req); + io_queue_async_work(req, NULL); break; } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else { io_req_complete_failed(req, ret); } - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; - if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); + } else if (req->flags & REQ_F_FAIL) { + io_req_complete_failed(req, req->result); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req); + io_queue_async_work(req, NULL); } } @@ -6549,19 +6755,19 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state; unsigned int sqe_flags; int personality, ret = 0; + /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - atomic_set(&req->refs, 2); req->task = current; /* enforce forwards compatibility on users */ @@ -6599,9 +6805,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; } @@ -6612,6 +6817,7 @@ static int 
io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; @@ -6619,20 +6825,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: + /* fail even hard links since we don't submit */ if (link->head) { - /* fail even hard links since we don't submit */ - req_set_fail(link->head); - io_req_complete_failed(link->head, -ECANCELED); - link->head = NULL; + /* + * we can judge a link req is failed or cancelled by if + * REQ_F_FAIL is set, but the head is an exception since + * it may be set REQ_F_FAIL because of other req's failure + * so let's leverage req->result to distinguish if a head + * is set REQ_F_FAIL because of its failure or other req's + * failure so that we can set the correct ret code for it. + * init result here to avoid affecting the normal path. + */ + if (!(link->head->flags & REQ_F_FAIL)) + req_fail_link_node(link->head, -ECANCELED); + } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + /* + * the current req is a normal req, we should return + * error and thus break the submission loop. 
+ */ + io_req_complete_failed(req, ret); + return ret; } - io_req_complete_failed(req, ret); - return ret; + req_fail_link_node(req, ret); + } else { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; } - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, req->flags, true, @@ -6648,9 +6868,14 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; + if (!(req->flags & REQ_F_FAIL)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) { + req_fail_link_node(req, ret); + if (!(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + } + } trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; @@ -6680,11 +6905,10 @@ static void io_submit_state_end(struct io_submit_state *state, { if (state->link.head) io_queue_sqe(state->link.head); - if (state->comp.nr) + if (state->compl_nr) io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); - io_state_file_put(state); } /* @@ -6744,26 +6968,17 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + __must_hold(&ctx->uring_lock) { - struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; + io_get_task_refs(nr); - tctx = current->io_uring; - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) { - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, &current->usage); - tctx->cached_refs += refill; - } io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { const struct io_uring_sqe *sqe; struct 
io_kiocb *req; @@ -6776,7 +6991,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - kmem_cache_free(req_cachep, req); + list_add(&req->inflight_entry, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ @@ -6808,18 +7023,18 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) { - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) @@ -6841,7 +7056,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0, true); + io_do_iopoll(ctx, &nr_events, 0); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -6976,21 +7191,21 @@ static int io_sq_thread(void *data) struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; - unsigned to_wait; + unsigned cq_tail; unsigned nr_timeouts; }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; + int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. 
*/ - return io_cqring_events(ctx) >= iowq->to_wait || - atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, @@ -7046,21 +7261,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { - struct io_wait_queue iowq = { - .wq = { - .private = current, - .func = io_wake_function, - .entry = LIST_HEAD_INIT(iowq.wq.entry), - }, - .ctx = ctx, - .to_wait = min_events, - }; + struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; int ret; do { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -7088,11 +7295,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, timeout = timespec64_to_jiffies(&ts); } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false)) { + if (!io_cqring_overflow_flush(ctx)) { ret = -EBUSY; break; } @@ -7123,14 +7336,14 @@ static void **io_alloc_page_table(size_t size) size_t init_size = size; void **table; - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - table[i] = kzalloc(this_size, GFP_KERNEL); + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); 
return NULL; @@ -7146,6 +7359,50 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } +static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) { @@ -7277,17 +7534,14 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - table->files = (struct io_fixed_file **)io_alloc_page_table(size); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); return !!table->files; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_file_tables(struct io_file_table *table) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - io_free_page_table((void **)table->files, size); + 
kvfree(table->files); table->files = NULL; } @@ -7312,7 +7566,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) fput(file); } #endif - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7628,11 +7882,11 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -7664,50 +7918,6 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - 
INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -7722,6 +7932,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; @@ -7780,7 +7992,7 @@ out_fput: if (file) fput(file); } - io_free_file_tables(&ctx->file_table, nr_args); + io_free_file_tables(&ctx->file_table); ctx->nr_user_files = 0; out_free: io_rsrc_data_free(ctx->file_data); @@ -7831,6 +8043,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -7926,14 +8178,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, 
return done ? done : err; } -static struct io_wq_work *io_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? &req->work : NULL; -} - static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { @@ -7957,7 +8201,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_free_work; + data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ @@ -8624,33 +8868,29 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) __io_remove_buffers(ctx, buf, index, -1U); } -static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) +static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; - list_for_each_entry_safe(req, nxt, list, compl.list) { - if (tsk && req->task != tsk) - continue; - list_del(&req->compl.list); + list_for_each_entry_safe(req, nxt, list, inflight_entry) { + list_del(&req->inflight_entry); kmem_cache_free(req_cachep, req); } } static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *submit_state = &ctx->submit_state; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) { - kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, - submit_state->reqs); - submit_state->free_reqs = 0; + if (state->free_reqs) { + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); + state->free_reqs = 0; } - io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list, NULL); + io_flush_cached_locked_reqs(ctx, state); + io_req_cache_free(&state->free_list); mutex_unlock(&ctx->uring_lock); } @@ -8702,6 +8942,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) sock_release(ctx->ring_sock); } #endif + 
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -8801,6 +9042,7 @@ static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8825,8 +9067,11 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } - WARN_ON_ONCE(time_after(jiffies, timeout)); - } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { + /* there is little hope left, don't run it too often */ + interval = HZ * 60; + } + } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); @@ -8855,8 +9100,8 @@ static void io_ring_exit_work(struct work_struct *work) mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); - spin_lock_irq(&ctx->completion_lock); - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_ring_ctx_free(ctx); } @@ -8868,16 +9113,18 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req, *tmp; int canceled = 0; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } + spin_unlock_irq(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); return canceled != 0; @@ -8933,13 +9180,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) bool ret; if 
(!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); } else { ret = io_match_task(req, cancel->task, cancel->all); } @@ -8952,14 +9198,14 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; @@ -9124,8 +9370,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) * Must be after io_uring_del_task_file() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
*/ - tctx->io_wq = NULL; io_wq_put_and_exit(wq); + tctx->io_wq = NULL; } } @@ -9141,9 +9387,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } } /* @@ -9164,9 +9412,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) @@ -9190,6 +9438,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + io_uring_drop_tctx_refs(current); /* * If we've seen completions, retry without waiting. 
This * avoids a race where a completion comes in before we did @@ -9208,9 +9457,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, @@ -9370,7 +9619,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; @@ -9506,7 +9755,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; struct io_kiocb *req; @@ -9515,7 +9764,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (has_lock) mutex_unlock(&ctx->uring_lock); } @@ -10048,6 +10297,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if 
(copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10065,12 +10339,40 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; } } +static int io_ctx_quiesce(struct io_ring_ctx *ctx) +{ + long ret; + + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab the + * uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + } while (ret >= 0); + mutex_lock(&ctx->uring_lock); + + if (ret) + io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -10095,31 +10397,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } if (io_register_op_must_quiesce(opcode)) { - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If - * another thread is currently inside io_uring_enter() it might - * need to grab the uring_lock to make progress. If we hold it - * here across the drain wait, then we can deadlock. It's safe - * to drop the mutex here, since no new references will come in - * after we've killed the percpu ref. 
- */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) - break; - ret = io_run_task_work_sig(); - if (ret < 0) - break; - } while (1); - mutex_lock(&ctx->uring_lock); - - if (ret) { - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + ret = io_ctx_quiesce(ctx); + if (ret) return ret; - } } switch (opcode) { @@ -10216,6 +10496,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; @@ -10297,11 +10583,16 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); + + /* ->buf_index is u16 */ + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + 
actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int 
squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/timerfd.c b/fs/timerfd.c index c5509d2448e3..e9c96a0c79f1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -115,6 +115,22 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } +static void timerfd_resume_work(struct work_struct *work) +{ + timerfd_clock_was_set(); +} + +static DECLARE_WORK(timerfd_work, timerfd_resume_work); + +/* + * 
Invoked from timekeeping_resume(). Defer the actual update to work so + * timerfd_clock_was_set() runs in task context. + */ +void timerfd_resume(void) +{ + schedule_work(&timerfd_work); +} + static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..3ab73567a0f5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -844,7 +844,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, diff --git a/include/dt-bindings/clock/ingenic,sysost.h b/include/dt-bindings/clock/ingenic,sysost.h index 063791b01ab3..d7aa42c08ded 100644 --- a/include/dt-bindings/clock/ingenic,sysost.h +++ b/include/dt-bindings/clock/ingenic,sysost.h @@ -13,4 +13,23 @@ #define OST_CLK_PERCPU_TIMER2 3 #define OST_CLK_PERCPU_TIMER3 4 +#define OST_CLK_EVENT_TIMER 1 + +#define OST_CLK_EVENT_TIMER0 0 +#define OST_CLK_EVENT_TIMER1 1 +#define OST_CLK_EVENT_TIMER2 2 +#define OST_CLK_EVENT_TIMER3 3 +#define OST_CLK_EVENT_TIMER4 4 +#define OST_CLK_EVENT_TIMER5 5 +#define OST_CLK_EVENT_TIMER6 6 +#define OST_CLK_EVENT_TIMER7 7 +#define OST_CLK_EVENT_TIMER8 8 +#define OST_CLK_EVENT_TIMER9 9 +#define OST_CLK_EVENT_TIMER10 10 +#define OST_CLK_EVENT_TIMER11 11 +#define OST_CLK_EVENT_TIMER12 12 +#define OST_CLK_EVENT_TIMER13 13 +#define OST_CLK_EVENT_TIMER14 14 +#define OST_CLK_EVENT_TIMER15 15 + #endif /* __DT_BINDINGS_CLOCK_INGENIC_OST_H__ */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..29530859d9ff 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -143,7 +143,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return I_BDEV(inode)->bd_bdi; + return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } diff 
--git a/include/linux/bio.h b/include/linux/bio.h index 2203b686e1f0..7b5f65a81f2b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -5,7 +5,6 @@ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H -#include #include #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ @@ -519,47 +518,6 @@ static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ -#ifdef CONFIG_HIGHMEM -/* - * remember never ever reenable interrupts between a bvec_kmap_irq and - * bvec_kunmap_irq! - */ -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - unsigned long addr; - - /* - * might not be a highmem page, but the preempt/irq count - * balancing is a lot nicer this way - */ - local_irq_save(*flags); - addr = (unsigned long) kmap_atomic(bvec->bv_page); - - BUG_ON(addr & ~PAGE_MASK); - - return (char *) addr + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr); - local_irq_restore(*flags); -} - -#else -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - return page_address(bvec->bv_page) + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - *flags = 0; -} -#endif - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. 
* diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 37048438872c..b4de2010fba5 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -152,8 +152,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, - size_t size); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); struct blkcg_policy { int plid; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1d18447ebebc..13ba1861e688 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -404,7 +404,13 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, + /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, + /* + * Select 'none' during queue registration in case of a single hwq + * or shared hwqs instead of 'mq-deadline'. 
+ */ + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, @@ -426,18 +432,14 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ ({ \ static struct lock_class_key __key; \ - struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ \ - if (!IS_ERR(__disk)) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_mq_alloc_disk(set, queuedata, &__key); \ }) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, - void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c2890767ac3e..054aa6722fa9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,14 +34,10 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS - struct list_head bd_holder_disks; -#endif struct kobject *bd_holder_dir; u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; - struct backing_dev_info *bd_bdi; /* The counter of freeze processes */ int bd_fsfreeze_count; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3afea47ade6..c9cb12483e12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -398,8 +397,6 @@ struct request_queue { struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; - struct backing_dev_info *backing_dev_info; - /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. 
@@ -424,6 +421,8 @@ struct request_queue { spinlock_t queue_lock; + struct gendisk *disk; + /* * queue kobject */ @@ -664,8 +663,6 @@ extern void blk_clear_pm_only(struct request_queue *q); dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) -#define queue_to_disk(q) (dev_to_disk(kobj_to_dev((q)->kobj.parent))) - static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; @@ -941,6 +938,10 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request @@ -1139,7 +1140,7 @@ void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); -void blk_queue_update_readahead(struct request_queue *q); +void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); @@ -1521,6 +1522,22 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } +/* + * Two cases of handling DISCARD merge: + * If max_discard_segments > 1, the driver takes every bio + * as a range and send them to controller together. The ranges + * needn't to be contiguous. + * Otherwise, the bios/requests will be handled as same as + * others which should be contiguous. 
+ */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1855,6 +1872,13 @@ struct block_device_operations { char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; + + /* + * Special callback for probing GPT entry at a given sector. + * Needed by Android devices, used by GPT scanner and MMC blk + * driver. + */ + int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT @@ -1984,8 +2008,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdgrab(struct block_device *bdev); -void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ff832e698efb..0e9bdd42dafb 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -4,9 +4,10 @@ * * Copyright (C) 2001 Ming Lei */ -#ifndef __LINUX_BVEC_ITER_H -#define __LINUX_BVEC_ITER_H +#ifndef __LINUX_BVEC_H +#define __LINUX_BVEC_H +#include #include #include #include @@ -183,4 +184,61 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } -#endif /* __LINUX_BVEC_ITER_H */ +/** + * bvec_kmap_local - map a bvec into the kernel virtual address space + * @bvec: bvec to map + * + * Must be called on single-page bvecs only. Call kunmap_local on the returned + * address to unmap. 
+ */ +static inline void *bvec_kmap_local(struct bio_vec *bvec) +{ + return kmap_local_page(bvec->bv_page) + bvec->bv_offset; +} + +/** + * memcpy_from_bvec - copy data from a bvec + * @bvec: bvec to copy from + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec) +{ + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + +/** + * memcpy_to_bvec - copy data to a bvec + * @bvec: bvec to copy to + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from) +{ + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len); +} + +/** + * memzero_bvec - zero all data in a bvec + * @bvec: bvec to zero + * + * Must be called on single-page bvecs only. + */ +static inline void memzero_bvec(struct bio_vec *bvec) +{ + memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + +/** + * bvec_virt - return the virtual address for a bvec + * @bvec: bvec to return the virtual address for + * + * Note: the caller must ensure that @bvec->bv_page is not a highmem page. + */ +static inline void *bvec_virt(struct bio_vec *bvec) +{ + WARN_ON_ONCE(PageHighMem(bvec->bv_page)); + return page_address(bvec->bv_page) + bvec->bv_offset; +} + +#endif /* __LINUX_BVEC_H */ diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h deleted file mode 100644 index 68a541807bdf..000000000000 --- a/include/linux/cmdline-parser.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Parsing command line, get the partitions information. 
- * - * Written by Cai Zhiyong - * - */ -#ifndef CMDLINEPARSEH -#define CMDLINEPARSEH - -#include -#include -#include - -/* partition flags */ -#define PF_RDONLY 0x01 /* Device is read only */ -#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ - -struct cmdline_subpart { - char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ - sector_t from; - sector_t size; - int flags; - struct cmdline_subpart *next_subpart; -}; - -struct cmdline_parts { - char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ - unsigned int nr_subparts; - struct cmdline_subpart *subpart; - struct cmdline_parts *next_parts; -}; - -void cmdline_parts_free(struct cmdline_parts **parts); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev); - -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param); - -#endif /* CMDLINEPARSEH */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7457d49acf9a..94f2cd6a8e83 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -151,7 +151,6 @@ typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); -#define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6adf824e6459..bb25ce88be50 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3204,10 +3204,6 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); -/* fs/block_dev.c */ -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync); - /* 
fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13b34177cc85..c68d83c87f83 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,9 +60,6 @@ struct partition_meta_info { * device. * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. * - * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up", - * with a similar meaning to network interfaces. - * * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include * partition information in ``/proc/partitions`` or in the output of * printk_all_partitions(). @@ -97,7 +94,6 @@ struct partition_meta_info { /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ #define GENHD_FL_CD 0x0008 -#define GENHD_FL_UP 0x0010 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NATIVE_CAPACITY 0x0080 @@ -153,13 +149,15 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 -#define GD_QUEUE_REF 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ + struct backing_dev_info *bdi; struct kobject *slave_dir; - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; @@ -172,8 +170,14 @@ struct gendisk { int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; + u64 diskseq; }; +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. 
@@ -210,18 +214,12 @@ static inline dev_t disk_devt(struct gendisk *disk) void disk_uevent(struct gendisk *disk, enum kobject_action action); /* block/genhd.c */ -extern void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline void add_disk(struct gendisk *disk) +int device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int add_disk(struct gendisk *disk) { - device_add_disk(NULL, disk, NULL); + return device_add_disk(NULL, disk, NULL); } -extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); -static inline void add_disk_no_queue_reg(struct gendisk *disk) -{ - device_add_disk_no_queue_reg(NULL, disk); -} - extern void del_gendisk(struct gendisk *gp); void set_disk_ro(struct gendisk *disk, bool read_only); @@ -236,6 +234,7 @@ extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; @@ -259,26 +258,10 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -extern struct gendisk *__alloc_disk_node(int minors, int node_id); +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); - -#define alloc_disk_node(minors, node_id) \ -({ \ - static struct lock_class_key __key; \ - const char *__name; \ - struct gendisk *__disk; \ - \ - __name = "(gendisk_completion)"#minors"("#node_id")"; \ - \ - __disk = __alloc_disk_node(minors, node_id); \ - \ - if 
(__disk) \ - lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \ - \ - __disk; \ -}) - -#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure @@ -291,15 +274,10 @@ extern void put_disk(struct gendisk *disk); */ #define blk_alloc_disk(node_id) \ ({ \ - struct gendisk *__disk = __blk_alloc_disk(node_id); \ static struct lock_class_key __key; \ \ - if (__disk) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_alloc_disk(node_id, &__key); \ }) -struct gendisk *__blk_alloc_disk(int node); void blk_cleanup_disk(struct gendisk *disk); int __register_blkdev(unsigned int major, const char *name, @@ -316,9 +294,10 @@ void set_capacity(struct gendisk *disk, sector_t size); int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -329,9 +308,14 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -#endif /* CONFIG_SYSFS */ +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk *disk); dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bb5e7b0a4274..0ee140176f10 100644 --- a/include/linux/hrtimer.h +++ 
b/include/linux/hrtimer.h @@ -318,16 +318,12 @@ struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern void clock_was_set_delayed(void); - extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void clock_was_set_delayed(void) { } - #endif static inline ktime_t @@ -351,13 +347,13 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) timer->base->get_time()); } -extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); +extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } +static inline void timerfd_resume(void) { } #endif -extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 04b650bcbbe5..649a4d7c241b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,17 +7,18 @@ #if defined(CONFIG_IO_URING) struct sock *io_uring_get_socket(struct file *file); -void __io_uring_cancel(struct files_struct *files); +void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { if (current->io_uring) - __io_uring_cancel(files); + __io_uring_cancel(false); } static inline void io_uring_task_cancel(void) { - return io_uring_files_cancel(NULL); + if (current->io_uring) + __io_uring_cancel(true); } static inline void io_uring_free(struct task_struct *tsk) { @@ -32,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file) static inline void io_uring_task_cancel(void) { } -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { } static inline void io_uring_free(struct task_struct *tsk) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h 
index e9bfe6972aed..3f53bc27a19b 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -6,46 +6,22 @@ #include #include -/* - * Gives us 8 prio classes with 13-bits of data for each class - */ -#define IOPRIO_CLASS_SHIFT (13) -#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) - -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) - -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) +#include /* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. + * Default IO priority. */ -enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, -}; +#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) /* - * 8 best effort priority levels are supported + * Check that a priority value has a valid class. */ -#define IOPRIO_BE_NR (8) +static inline bool ioprio_valid(unsigned short ioprio) +{ + unsigned short class = IOPRIO_PRIO_CLASS(ioprio); -enum { - IOPRIO_WHO_PROCESS = 1, - IOPRIO_WHO_PGRP, - IOPRIO_WHO_USER, -}; - -/* - * Fallback BE priority - */ -#define IOPRIO_NORM (4) + return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; +} /* * if process has set io priority explicitly, use that. 
if not, convert @@ -80,7 +56,7 @@ static inline int get_current_ioprio(void) if (ioc) return ioc->ioprio; - return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + return IOPRIO_DEFAULT; } /* diff --git a/include/linux/libata.h b/include/linux/libata.h index 3fcd24236793..860e63f5667b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -161,6 +161,10 @@ enum { ATA_DFLAG_D_SENSE = (1 << 29), /* Descriptor sense requested */ ATA_DFLAG_ZAC = (1 << 30), /* ZAC device */ + ATA_DFLAG_FEATURES_MASK = ATA_DFLAG_TRUSTED | ATA_DFLAG_DA | \ + ATA_DFLAG_DEVSLP | ATA_DFLAG_NCQ_SEND_RECV | \ + ATA_DFLAG_NCQ_PRIO, + ATA_DEV_UNKNOWN = 0, /* unknown device */ ATA_DEV_ATA = 1, /* ATA device */ ATA_DEV_ATA_UNSUP = 2, /* ATA device (unsupported) */ @@ -535,6 +539,7 @@ typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes) extern struct device_attribute dev_attr_unload_heads; #ifdef CONFIG_SATA_HOST extern struct device_attribute dev_attr_link_power_management_policy; +extern struct device_attribute dev_attr_ncq_prio_supported; extern struct device_attribute dev_attr_ncq_prio_enable; extern struct device_attribute dev_attr_em_message_type; extern struct device_attribute dev_attr_em_message; @@ -1454,7 +1459,7 @@ static inline bool sata_pmp_attached(struct ata_port *ap) static inline bool ata_is_host_link(const struct ata_link *link) { - return 1; + return true; } #endif /* CONFIG_SATA_PMP */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h deleted file mode 100644 index 0908abda9c1b..000000000000 --- a/include/linux/lightnvm.h +++ /dev/null @@ -1,697 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef NVM_H -#define NVM_H - -#include -#include -#include - -enum { - NVM_IO_OK = 0, - NVM_IO_REQUEUE = 1, - NVM_IO_DONE = 2, - NVM_IO_ERR = 3, - - NVM_IOTYPE_NONE = 0, - NVM_IOTYPE_GC = 1, -}; - -/* common format */ -#define NVM_GEN_CH_BITS (8) -#define NVM_GEN_LUN_BITS (8) -#define NVM_GEN_BLK_BITS (16) -#define NVM_GEN_RESERVED (32) - 
-/* 1.2 format */ -#define NVM_12_PG_BITS (16) -#define NVM_12_PL_BITS (4) -#define NVM_12_SEC_BITS (4) -#define NVM_12_RESERVED (8) - -/* 2.0 format */ -#define NVM_20_SEC_BITS (24) -#define NVM_20_RESERVED (8) - -enum { - NVM_OCSSD_SPEC_12 = 12, - NVM_OCSSD_SPEC_20 = 20, -}; - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - /* generic device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 reserved : NVM_GEN_RESERVED; - } a; - - /* 1.2 device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 pg : NVM_12_PG_BITS; - u64 pl : NVM_12_PL_BITS; - u64 sec : NVM_12_SEC_BITS; - u64 reserved : NVM_12_RESERVED; - } g; - - /* 2.0 device format */ - struct { - u64 grp : NVM_GEN_CH_BITS; - u64 pu : NVM_GEN_LUN_BITS; - u64 chk : NVM_GEN_BLK_BITS; - u64 sec : NVM_20_SEC_BITS; - u64 reserved : NVM_20_RESERVED; - } m; - - struct { - u64 line : 63; - u64 is_cached : 1; - } c; - - u64 ppa; - }; -}; - -struct nvm_rq; -struct nvm_id; -struct nvm_dev; -struct nvm_tgt_dev; -struct nvm_chk_meta; - -typedef int (nvm_id_fn)(struct nvm_dev *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); -typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, - struct nvm_chk_meta *); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_get_chk_meta_fn *get_chk_meta; - - nvm_submit_io_fn *submit_io; - - nvm_create_dma_pool_fn 
*create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; -}; - -#ifdef CONFIG_NVM - -#include -#include - -enum { - /* HW Responsibilities */ - NVM_RSP_L2P = 1 << 0, - NVM_RSP_ECC = 1 << 1, - - /* Physical Adressing Mode */ - NVM_ADDRMODE_LINEAR = 0, - NVM_ADDRMODE_CHANNEL = 1, - - /* Plane programming mode for LUN */ - NVM_PLANE_SINGLE = 1, - NVM_PLANE_DOUBLE = 2, - NVM_PLANE_QUAD = 4, - - /* Status codes */ - NVM_RSP_SUCCESS = 0x0, - NVM_RSP_NOT_CHANGEABLE = 0x1, - NVM_RSP_ERR_FAILWRITE = 0x40ff, - NVM_RSP_ERR_EMPTYPAGE = 0x42ff, - NVM_RSP_ERR_FAILECC = 0x4281, - NVM_RSP_ERR_FAILCRC = 0x4004, - NVM_RSP_WARN_HIGHECC = 0x4700, - - /* Device opcodes */ - NVM_OP_PWRITE = 0x91, - NVM_OP_PREAD = 0x92, - NVM_OP_ERASE = 0x90, - - /* PPA Command Flags */ - NVM_IO_SNGL_ACCESS = 0x0, - NVM_IO_DUAL_ACCESS = 0x1, - NVM_IO_QUAD_ACCESS = 0x2, - - /* NAND Access Modes */ - NVM_IO_SUSPEND = 0x80, - NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_ENABLE = 0x200, - - /* Block Types */ - NVM_BLK_T_FREE = 0x0, - NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_GRWN_BAD = 0x2, - NVM_BLK_T_DEV = 0x4, - NVM_BLK_T_HOST = 0x8, - - /* Memory capabilities */ - NVM_ID_CAP_SLC = 0x1, - NVM_ID_CAP_CMD_SUSPEND = 0x2, - NVM_ID_CAP_SCRAMBLE = 0x4, - NVM_ID_CAP_ENCRYPT = 0x8, - - /* Memory types */ - NVM_ID_FMTYPE_SLC = 0, - NVM_ID_FMTYPE_MLC = 1, - - /* Device capabilities */ - NVM_ID_DCAP_BBLKMGMT = 0x1, - NVM_UD_DCAP_ECC = 0x2, -}; - -struct nvm_id_lp_mlc { - u16 num_pairs; - u8 pairs[886]; -}; - -struct nvm_id_lp_tbl { - __u8 id[8]; - struct nvm_id_lp_mlc mlc; -}; - -struct nvm_addrf_12 { - u8 ch_len; - u8 lun_len; - u8 blk_len; - u8 pg_len; - u8 pln_len; - u8 sec_len; - - u8 ch_offset; - u8 lun_offset; - u8 blk_offset; - u8 pg_offset; - u8 pln_offset; - u8 sec_offset; - - u64 ch_mask; - u64 lun_mask; - u64 blk_mask; - u64 pg_mask; - u64 pln_mask; - u64 sec_mask; -}; - -struct nvm_addrf { - u8 ch_len; - u8 lun_len; - u8 chk_len; - u8 
sec_len; - u8 rsv_len[2]; - - u8 ch_offset; - u8 lun_offset; - u8 chk_offset; - u8 sec_offset; - u8 rsv_off[2]; - - u64 ch_mask; - u64 lun_mask; - u64 chk_mask; - u64 sec_mask; - u64 rsv_mask[2]; -}; - -enum { - /* Chunk states */ - NVM_CHK_ST_FREE = 1 << 0, - NVM_CHK_ST_CLOSED = 1 << 1, - NVM_CHK_ST_OPEN = 1 << 2, - NVM_CHK_ST_OFFLINE = 1 << 3, - - /* Chunk types */ - NVM_CHK_TP_W_SEQ = 1 << 0, - NVM_CHK_TP_W_RAN = 1 << 1, - NVM_CHK_TP_SZ_SPEC = 1 << 4, -}; - -/* - * Note: The structure size is linked to nvme_nvm_chk_meta such that the same - * buffer can be used when converting from little endian to cpu addressing. - */ -struct nvm_chk_meta { - u8 state; - u8 type; - u8 wi; - u8 rsvd[5]; - u64 slba; - u64 cnlb; - u64 wp; -}; - -struct nvm_target { - struct list_head list; - struct nvm_tgt_dev *dev; - struct nvm_tgt_type *type; - struct gendisk *disk; -}; - -#define ADDR_EMPTY (~0ULL) - -#define NVM_TARGET_DEFAULT_OP (101) -#define NVM_TARGET_MIN_OP (3) -#define NVM_TARGET_MAX_OP (80) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCH 0 - -#define NVM_MAX_VLBA (64) /* max logical blocks in a vector command */ - -struct nvm_rq; -typedef void (nvm_end_io_fn)(struct nvm_rq *); - -struct nvm_rq { - struct nvm_tgt_dev *dev; - - struct bio *bio; - - union { - struct ppa_addr ppa_addr; - dma_addr_t dma_ppa_list; - }; - - struct ppa_addr *ppa_list; - - void *meta_list; - dma_addr_t dma_meta_list; - - nvm_end_io_fn *end_io; - - uint8_t opcode; - uint16_t nr_ppas; - uint16_t flags; - - u64 ppa_status; /* ppa media status */ - int error; - - int is_seq; /* Sequential hint flag. 1.2 only */ - - void *private; -}; - -static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) -{ - return pdu - sizeof(struct nvm_rq); -} - -static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) -{ - return rqdata + 1; -} - -static inline struct ppa_addr *nvm_rq_to_ppa_list(struct nvm_rq *rqd) -{ - return (rqd->nr_ppas > 1) ? 
rqd->ppa_list : &rqd->ppa_addr; -} - -enum { - NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ - NVM_BLK_ST_BAD = 0x8, /* Bad block */ -}; - -/* Instance geometry */ -struct nvm_geo { - /* device reported version */ - u8 major_ver_id; - u8 minor_ver_id; - - /* kernel short version */ - u8 version; - - /* instance specific geometry */ - int num_ch; - int num_lun; /* per channel */ - - /* calculated values */ - int all_luns; /* across channels */ - int all_chunks; /* across channels */ - - int op; /* over-provision in instance */ - - sector_t total_secs; /* across channels */ - - /* chunk geometry */ - u32 num_chk; /* chunks per lun */ - u32 clba; /* sectors per chunk */ - u16 csecs; /* sector size */ - u16 sos; /* out-of-band area size */ - bool ext; /* metadata in extended data buffer */ - u32 mdts; /* Max data transfer size*/ - - /* device write constrains */ - u32 ws_min; /* minimum write size */ - u32 ws_opt; /* optimal write size */ - u32 mw_cunits; /* distance required for successful read */ - u32 maxoc; /* maximum open chunks */ - u32 maxocpu; /* maximum open chunks per parallel unit */ - - /* device capabilities */ - u32 mccap; - - /* device timings */ - u32 trdt; /* Avg. Tread (ns) */ - u32 trdm; /* Max Tread (ns) */ - u32 tprt; /* Avg. Tprog (ns) */ - u32 tprm; /* Max Tprog (ns) */ - u32 tbet; /* Avg. 
Terase (ns) */ - u32 tbem; /* Max Terase (ns) */ - - /* generic address format */ - struct nvm_addrf addrf; - - /* 1.2 compatibility */ - u8 vmnt; - u32 cap; - u32 dom; - - u8 mtype; - u8 fmtype; - - u16 cpar; - u32 mpos; - - u8 num_pln; - u8 pln_mode; - u16 num_pg; - u16 fpg_sz; -}; - -/* sub-device structure */ -struct nvm_tgt_dev { - /* Device information */ - struct nvm_geo geo; - - /* Base ppas for target LUNs */ - struct ppa_addr *luns; - - struct request_queue *q; - - struct nvm_dev *parent; - void *map; -}; - -struct nvm_dev { - struct nvm_dev_ops *ops; - - struct list_head devices; - - /* Device information */ - struct nvm_geo geo; - - unsigned long *lun_map; - void *dma_pool; - - /* Backend device */ - struct request_queue *q; - char name[DISK_NAME_LEN]; - void *private_data; - - struct kref ref; - void *rmap; - - struct mutex mlock; - spinlock_t lock; - - /* target management */ - struct list_head area_list; - struct list_head targets; -}; - -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.ppa = ((u64)r.g.ch) << ppaf->ch_offset; - l.ppa |= ((u64)r.g.lun) << ppaf->lun_offset; - l.ppa |= ((u64)r.g.blk) << ppaf->blk_offset; - l.ppa |= ((u64)r.g.pg) << ppaf->pg_offset; - l.ppa |= ((u64)r.g.pl) << ppaf->pln_offset; - l.ppa |= ((u64)r.g.sec) << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.ppa = ((u64)r.m.grp) << lbaf->ch_offset; - l.ppa |= ((u64)r.m.pu) << lbaf->lun_offset; - l.ppa |= ((u64)r.m.chk) << lbaf->chk_offset; - l.ppa |= ((u64)r.m.sec) << lbaf->sec_offset; - } - - return l; -} - -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - l.ppa = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct 
nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.g.ch = (r.ppa & ppaf->ch_mask) >> ppaf->ch_offset; - l.g.lun = (r.ppa & ppaf->lun_mask) >> ppaf->lun_offset; - l.g.blk = (r.ppa & ppaf->blk_mask) >> ppaf->blk_offset; - l.g.pg = (r.ppa & ppaf->pg_mask) >> ppaf->pg_offset; - l.g.pl = (r.ppa & ppaf->pln_mask) >> ppaf->pln_offset; - l.g.sec = (r.ppa & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.m.grp = (r.ppa & lbaf->ch_mask) >> lbaf->ch_offset; - l.m.pu = (r.ppa & lbaf->lun_mask) >> lbaf->lun_offset; - l.m.chk = (r.ppa & lbaf->chk_mask) >> lbaf->chk_offset; - l.m.sec = (r.ppa & lbaf->sec_mask) >> lbaf->sec_offset; - } - - return l; -} - -static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf, - struct ppa_addr p) -{ - struct nvm_geo *geo = &dev->geo; - u64 caddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)addrf; - - caddr = (u64)p.g.pg << ppaf->pg_offset; - caddr |= (u64)p.g.pl << ppaf->pln_offset; - caddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - caddr = p.m.sec; - } - - return caddr; -} - -static inline struct ppa_addr nvm_ppa32_to_ppa64(struct nvm_dev *dev, - void *addrf, u32 ppa32) -{ - struct ppa_addr ppa64; - - ppa64.ppa = 0; - - if (ppa32 == -1) { - ppa64.ppa = ADDR_EMPTY; - } else if (ppa32 & (1U << 31)) { - ppa64.c.line = ppa32 & ((~0U) >> 1); - ppa64.c.is_cached = 1; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - - ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> - ppaf->ch_offset; - ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> - ppaf->lun_offset; - ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> - ppaf->blk_offset; - ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> - ppaf->pg_offset; - ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> - ppaf->pln_offset; - ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> - ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - 
ppa64.m.grp = (ppa32 & lbaf->ch_mask) >> - lbaf->ch_offset; - ppa64.m.pu = (ppa32 & lbaf->lun_mask) >> - lbaf->lun_offset; - ppa64.m.chk = (ppa32 & lbaf->chk_mask) >> - lbaf->chk_offset; - ppa64.m.sec = (ppa32 & lbaf->sec_mask) >> - lbaf->sec_offset; - } - } - - return ppa64; -} - -static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev, - void *addrf, struct ppa_addr ppa64) -{ - u32 ppa32 = 0; - - if (ppa64.ppa == ADDR_EMPTY) { - ppa32 = ~0U; - } else if (ppa64.c.is_cached) { - ppa32 |= ppa64.c.line; - ppa32 |= 1U << 31; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - - ppa32 |= ppa64.g.ch << ppaf->ch_offset; - ppa32 |= ppa64.g.lun << ppaf->lun_offset; - ppa32 |= ppa64.g.blk << ppaf->blk_offset; - ppa32 |= ppa64.g.pg << ppaf->pg_offset; - ppa32 |= ppa64.g.pl << ppaf->pln_offset; - ppa32 |= ppa64.g.sec << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - ppa32 |= ppa64.m.grp << lbaf->ch_offset; - ppa32 |= ppa64.m.pu << lbaf->lun_offset; - ppa32 |= ppa64.m.chk << lbaf->chk_offset; - ppa32 |= ppa64.m.sec << lbaf->sec_offset; - } - } - - return ppa32; -} - -static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev, - struct ppa_addr *ppa) -{ - struct nvm_geo *geo = &dev->geo; - int last = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - int sec = ppa->g.sec; - - sec++; - if (sec == geo->ws_min) { - int pg = ppa->g.pg; - - sec = 0; - pg++; - if (pg == geo->num_pg) { - int pl = ppa->g.pl; - - pg = 0; - pl++; - if (pl == geo->num_pln) - last = 1; - - ppa->g.pl = pl; - } - ppa->g.pg = pg; - } - ppa->g.sec = sec; - } else { - ppa->m.sec++; - if (ppa->m.sec == geo->clba) - last = 1; - } - - return last; -} - -typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, - int flags); -typedef void (nvm_tgt_exit_fn)(void *, bool); -typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); -typedef void 
(nvm_tgt_sysfs_exit_fn)(struct gendisk *); - -enum { - NVM_TGT_F_DEV_L2P = 0, - NVM_TGT_F_HOST_L2P = 1 << 0, -}; - -struct nvm_tgt_type { - const char *name; - unsigned int version[3]; - int flags; - - /* target entry points */ - const struct block_device_operations *bops; - nvm_tgt_capacity_fn *capacity; - - /* module-specific init/teardown */ - nvm_tgt_init_fn *init; - nvm_tgt_exit_fn *exit; - - /* sysfs */ - nvm_tgt_sysfs_init_fn *sysfs_init; - nvm_tgt_sysfs_exit_fn *sysfs_exit; - - /* For internal use */ - struct list_head list; - struct module *owner; -}; - -extern int nvm_register_tgt_type(struct nvm_tgt_type *); -extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); - -extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); -extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); - -extern struct nvm_dev *nvm_alloc_dev(int); -extern int nvm_register(struct nvm_dev *); -extern void nvm_unregister(struct nvm_dev *); - -extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, - int, struct nvm_chk_meta *); -extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *, - int, int); -extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern void nvm_end_io(struct nvm_rq *); - -#else /* CONFIG_NVM */ -struct nvm_dev_ops; - -static inline struct nvm_dev *nvm_alloc_dev(int node) -{ - return ERR_PTR(-EINVAL); -} -static inline int nvm_register(struct nvm_dev *dev) -{ - return -EINVAL; -} -static inline void nvm_unregister(struct nvm_dev *dev) {} -#endif /* CONFIG_NVM */ -#endif /* LIGHTNVM.H */ diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 74e6c0624d27..37f975875102 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -109,6 +109,7 @@ struct mmc_ext_csd { u8 raw_hc_erase_gap_size; /* 221 */ u8 raw_erase_timeout_mult; /* 223 */ u8 raw_hc_erase_grp_size; /* 224 */ + u8 
raw_boot_mult; /* 226 */ u8 raw_sec_trim_mult; /* 229 */ u8 raw_sec_erase_mult; /* 230 */ u8 raw_sec_feature_support;/* 231 */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0abd47e9ef9b..78dadf86b38f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -398,6 +398,7 @@ struct mmc_host { #else #define MMC_CAP2_CRYPTO 0 #endif +#define MMC_CAP2_ALT_GPT_TEGRA (1 << 28) /* Host with eMMC that has GPT entry at a non-standard location */ int fixed_drv_type; /* fixed driver type for non-removable media */ diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index eed280fae433..962cd41a2cb5 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -431,6 +431,8 @@ extern int param_get_int(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_uint; extern int param_set_uint(const char *val, const struct kernel_param *kp); extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..60e2101a009d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1121,6 +1121,7 @@ #define PCI_DEVICE_ID_3COM_3CR990SVR 0x990a #define PCI_VENDOR_ID_AL 0x10b9 +#define PCI_DEVICE_ID_AL_M1489 0x1489 #define PCI_DEVICE_ID_AL_M1533 0x1533 #define PCI_DEVICE_ID_AL_M1535 0x1535 #define PCI_DEVICE_ID_AL_M1541 0x1541 @@ -2643,6 +2644,7 @@ #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 +#define PCI_DEVICE_ID_INTEL_82425 0x0486 #define PCI_DEVICE_ID_INTEL_MRST_SD0 0x0807 #define PCI_DEVICE_ID_INTEL_MRST_SD1 0x0808 #define PCI_DEVICE_ID_INTEL_MFD_SD 0x0820 diff --git 
a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 896c16d2c5fb..00fef0064355 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,12 +82,19 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head, return timerqueue_add(head, &ctmr->node); } -static inline void cpu_timer_dequeue(struct cpu_timer *ctmr) +static inline bool cpu_timer_queued(struct cpu_timer *ctmr) { - if (ctmr->head) { + return !!ctmr->head; +} + +static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) +{ + if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; + return true; } + return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4bd3ee2f9863..e6e152262ccb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1482,6 +1482,16 @@ struct task_struct { struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b9126fe06c3f..0310a5add9ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -714,6 +714,12 @@ static inline void unlock_task_sighand(struct task_struct *task, spin_unlock_irqrestore(&task->sighand->siglock, *flags); } +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_task_sighand_held(struct task_struct *task); +#else +static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } +#endif + static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 0d8e3dcb7f88..d3c1a42a2edd 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags, unsigned long nofile); +extern struct file *do_accept(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..270677dc4f36 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -336,14 +336,9 @@ static inline void cgroup_writeback_umount(void) /* * mm/page-writeback.c */ -#ifdef CONFIG_BLOCK void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); -void laptop_mode_sync(struct work_struct *work); void laptop_mode_timer_fn(struct timer_list *t); -#else -static inline void laptop_sync_completion(void) { } -#endif bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK diff --git 
a/include/trace/events/kyber.h b/include/trace/events/kyber.h index f9802562edf6..491098a0d8ed 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strlcpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); ), diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 4c32e97dcdf0..bdf7b404b3e7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -184,6 +184,7 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 79126d5cd289..3caec9199658 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -55,7 +55,10 @@ struct io_uring_sqe { } __attribute__((packed)); /* personality to use, if used */ __u16 personality; - __s32 splice_fd_in; + union { + __s32 splice_fd_in; + __u32 file_index; + }; __u64 __pad2[2]; }; @@ -146,9 +149,13 @@ enum { /* * sqe->timeout_flags */ -#define IORING_TIMEOUT_ABS (1U << 0) -#define IORING_TIMEOUT_UPDATE (1U << 1) - +#define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) +#define 
IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) +#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) +#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* * sqe->splice_flags * extends splice(2) flags @@ -306,6 +313,9 @@ enum { IORING_REGISTER_IOWQ_AFF = 17, IORING_UNREGISTER_IOWQ_AFF = 18, + /* set/get max number of workers */ + IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* this goes last */ IORING_REGISTER_LAST }; diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h new file mode 100644 index 000000000000..f70f2596a6bf --- /dev/null +++ b/include/uapi/linux/ioprio.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IOPRIO_H +#define _UAPI_LINUX_IOPRIO_H + +/* + * Gives us 8 prio classes with 13-bits of data for each class + */ +#define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_CLASS_MASK 0x07 +#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) + +#define IOPRIO_PRIO_CLASS(ioprio) \ + (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) +#define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) \ + ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ + ((data) & IOPRIO_PRIO_MASK)) + +/* + * These are the io priority groups as implemented by the BFQ and mq-deadline + * schedulers. RT is the realtime class, it always gets premium service. For + * ATA disks supporting NCQ IO priority, RT class IOs will be processed using + * high priority NCQ commands. BE is the best-effort scheduling class, the + * default for any process. IDLE is the idle scheduling class, it is only + * served when no one else is using the disk. + */ +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +/* + * The RT and BE priority classes both support up to 8 priority levels. 
+ */ +#define IOPRIO_NR_LEVELS 8 +#define IOPRIO_BE_NR IOPRIO_NR_LEVELS + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +/* + * Fallback BE priority level. + */ +#define IOPRIO_NORM 4 +#define IOPRIO_BE_NORM IOPRIO_NORM + +#endif /* _UAPI_LINUX_IOPRIO_H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h deleted file mode 100644 index 2745afd9b8fa..000000000000 --- a/include/uapi/linux/lightnvm.h +++ /dev/null @@ -1,224 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2015 CNEX Labs. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. 
- */ - -#ifndef _UAPI_LINUX_LIGHTNVM_H -#define _UAPI_LINUX_LIGHTNVM_H - -#ifdef __KERNEL__ -#include -#else /* __KERNEL__ */ -#include -#include -#define DISK_NAME_LEN 32 -#endif /* __KERNEL__ */ - -#include -#include - -#define NVM_TTYPE_NAME_MAX 48 -#define NVM_TTYPE_MAX 63 -#define NVM_MMTYPE_LEN 8 - -#define NVM_CTRL_FILE "/dev/lightnvm/control" - -struct nvm_ioctl_info_tgt { - __u32 version[3]; - __u32 reserved; - char tgtname[NVM_TTYPE_NAME_MAX]; -}; - -struct nvm_ioctl_info { - __u32 version[3]; /* in/out - major, minor, patch */ - __u16 tgtsize; /* number of targets */ - __u16 reserved16; /* pad to 4K page */ - __u32 reserved[12]; - struct nvm_ioctl_info_tgt tgts[NVM_TTYPE_MAX]; -}; - -enum { - NVM_DEVICE_ACTIVE = 1 << 0, -}; - -struct nvm_ioctl_device_info { - char devname[DISK_NAME_LEN]; - char bmname[NVM_TTYPE_NAME_MAX]; - __u32 bmversion[3]; - __u32 flags; - __u32 reserved[8]; -}; - -struct nvm_ioctl_get_devices { - __u32 nr_devices; - __u32 reserved[31]; - struct nvm_ioctl_device_info info[31]; -}; - -struct nvm_ioctl_create_simple { - __u32 lun_begin; - __u32 lun_end; -}; - -struct nvm_ioctl_create_extended { - __u16 lun_begin; - __u16 lun_end; - __u16 op; - __u16 rsv; -}; - -enum { - NVM_CONFIG_TYPE_SIMPLE = 0, - NVM_CONFIG_TYPE_EXTENDED = 1, -}; - -struct nvm_ioctl_create_conf { - __u32 type; - union { - struct nvm_ioctl_create_simple s; - struct nvm_ioctl_create_extended e; - }; -}; - -enum { - NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */ -}; - -struct nvm_ioctl_create { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */ - char tgtname[DISK_NAME_LEN]; /* dev to expose target as */ - - __u32 flags; - - struct nvm_ioctl_create_conf conf; -}; - -struct nvm_ioctl_remove { - char tgtname[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_ioctl_dev_init { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char mmtype[NVM_MMTYPE_LEN]; /* register to media manager 
*/ - - __u32 flags; -}; - -enum { - NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as - * host blks or grown blks */ - NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ - NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ - NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ -}; - -struct nvm_ioctl_dev_factory { - char dev[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_user_vio { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nppas; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 ppa_list; - __u32 metadata_len; - __u32 data_len; - __u64 status; - __u32 result; - __u32 rsvd3[3]; -}; - -struct nvm_passthru_vio { - __u8 opcode; - __u8 flags; - __u8 rsvd[2]; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u64 ppa_list; - __u16 nppas; - __u16 control; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u64 status; - __u32 result; - __u32 timeout_ms; -}; - -/* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ -enum { - /* top level cmds */ - NVM_INFO_CMD = 0x20, - NVM_GET_DEVICES_CMD, - - /* device level cmds */ - NVM_DEV_CREATE_CMD, - NVM_DEV_REMOVE_CMD, - - /* Init a device to support LightNVM media managers */ - NVM_DEV_INIT_CMD, - - /* Factory reset device */ - NVM_DEV_FACTORY_CMD, - - /* Vector user I/O */ - NVM_DEV_VIO_ADMIN_CMD = 0x41, - NVM_DEV_VIO_CMD = 0x42, - NVM_DEV_VIO_USER_CMD = 0x43, -}; - -#define NVM_IOCTL 'L' /* 0x4c */ - -#define NVM_INFO _IOWR(NVM_IOCTL, NVM_INFO_CMD, \ - struct nvm_ioctl_info) -#define NVM_GET_DEVICES _IOR(NVM_IOCTL, NVM_GET_DEVICES_CMD, \ - struct nvm_ioctl_get_devices) -#define NVM_DEV_CREATE _IOW(NVM_IOCTL, NVM_DEV_CREATE_CMD, \ - struct nvm_ioctl_create) -#define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ - struct nvm_ioctl_remove) -#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ - struct nvm_ioctl_dev_init) -#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, 
NVM_DEV_FACTORY_CMD, \ - struct nvm_ioctl_dev_factory) - -#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ - struct nvm_user_vio) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCHLEVEL 0 - -#endif diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 968582cd91b5..23a50e30be6c 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -213,6 +213,7 @@ struct prctl_mm_map { /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 # define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) diff --git a/init/do_mounts.c b/init/do_mounts.c index 74aede860de7..b691d6891e51 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -432,10 +432,6 @@ retry: printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " - "explicit textual name for \"root=\" boot option.\n"); -#endif panic("VFS: Unable to mount root fs on %s", b); } if (!(flags & SB_RDONLY)) { diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92c..8299bd764e42 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned 
long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { diff --git a/kernel/signal.c b/kernel/signal.c index 6aa994db6521..151bea31e7b0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1414,6 +1414,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } EXPORT_SYMBOL_GPL(__lock_task_sighand); +#ifdef CONFIG_LOCKDEP +void lockdep_assert_task_sighand_held(struct task_struct *task) +{ + struct sighand_struct *sighand; + + rcu_read_lock(); + sighand = rcu_dereference(task->sighand); + if (sighand) + lockdep_assert_held(&sighand->siglock); + else + WARN_ON_ONCE(1); + rcu_read_unlock(); +} +#endif + /* * send signal info to all the members of a group */ diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 01df12395c0e..df922f49d171 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -19,6 +19,8 @@ #include #include +#include "tick-internal.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -34,9 +36,6 @@ static u64 wdtest_jiffies_read(struct clocksource *cs) return (u64)jiffies; } -/* Assume HZ > 100. 
*/ -#define JIFFIES_SHIFT 8 - static struct clocksource clocksource_wdtest_jiffies = { .name = "wdtest-jiffies", .rating = 1, /* lowest valid rating*/ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b89c76e1c02c..b8a14d2fb5ba 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -306,12 +306,12 @@ void clocksource_verify_percpu(struct clocksource *cs) return; cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); - get_online_cpus(); + cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); if (cpumask_weight(&cpus_chosen) == 0) { preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } @@ -337,7 +337,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_ahead), testcpu, cs->name); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4a66725b1d4a..0ea8702eb516 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,21 +652,10 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) { - ktime_t expires_next; - - expires_next = hrtimer_update_next_event(cpu_base); - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - cpu_base->expires_next = expires_next; /* @@ -689,7 +678,25 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if 
(!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -720,23 +727,7 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - - if (!__hrtimer_hres_active(base)) - return; - - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} +static void retrigger_next_event(void *arg); /* * Switch to high resolution mode @@ -758,29 +749,54 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} - -static DECLARE_WORK(hrtimer_work, clock_was_set_work); - -/* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. 
- */ -void clock_was_set_delayed(void) -{ - schedule_work(&hrtimer_work); -} - #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. + * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + /* + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. + * + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. 
+ */ + if (!__hrtimer_hres_active(base) && !tick_nohz_active) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (__hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); +} /* * When a timer is enqueued and expires earlier than the already enqueued @@ -835,75 +851,161 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; + if (expires >= cpu_base->expires_next) + return; + /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; - cpu_base->expires_next = expires; + + __hrtimer_reprogram(cpu_base, timer, expires); +} + +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; /* - * If hres is not active, hardware does not have to be - * programmed yet. + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. * - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. 
Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) - return; + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. + * If the sequence did not change over the update then the + * remote CPU already handled it. */ - tick_program_event(expires, 1); + if (seq == cpu_base->clock_was_set_seq) + return false; + + /* + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. + */ + if (cpu_base->in_hrtirq) + return false; + + /* + * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. + */ + active &= cpu_base->active_bases; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases, identified by base index */ + if (base->index < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; } /* - * Clock realtime was set + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. 
On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. */ -void clock_was_set(void) +void clock_was_set(unsigned int bases) { -#ifdef CONFIG_HIGH_RES_TIMERS - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -#endif + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); + cpumask_var_t mask; + int cpu; + + if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) + goto out_timerfd; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + unsigned long flags; + + cpu_base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); + +out_timerfd: timerfd_clock_was_set(); } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(CLOCK_SET_WALL); +} + +static 
DECLARE_WORK(hrtimer_work, clock_was_set_work); + /* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. */ -void hrtimers_resume(void) +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + +/* + * Called during resume either directly via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. + */ +void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); } /* @@ -1030,12 +1132,13 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - int reprogram; + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -1048,8 +1151,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). 
+ */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -1103,9 +1214,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; + bool force_local, first; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that, prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). 
+ */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1115,9 +1248,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } - return enqueue_hrtimer(timer, new_base, mode); + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; } /** @@ -1183,7 +1331,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 01935aafdb46..bc4db9e5ab70 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -10,28 +10,9 @@ #include #include "timekeeping.h" +#include "tick-internal.h" -/* Since jiffies uses a simple TICK_NSEC multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. TICK_NSEC grows as HZ - * shrinks, so values greater than 8 overflow 32bits when - * HZ=100. 
- */ -#if HZ < 34 -#define JIFFIES_SHIFT 6 -#elif HZ < 67 -#define JIFFIES_SHIFT 7 -#else -#define JIFFIES_SHIFT 8 -#endif - static u64 jiffies_read(struct clocksource *cs) { return (u64) jiffies; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 517be7fd175e..ee736861b18f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + lockdep_assert_task_sighand_held(tsk); + /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; @@ -405,6 +407,55 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +static struct posix_cputimer_base *timer_base(struct k_itimer *timer, + struct task_struct *tsk) +{ + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + return tsk->posix_cputimers.bases + clkidx; + else + return tsk->signal->posix_cputimers.bases + clkidx; +} + +/* + * Force recalculating the base earliest expiration on the next tick. + * This will also re-evaluate the need to keep around the process wide + * cputime counter and tick dependency and eventually shut these down + * if necessary. + */ +static void trigger_base_recalc_expires(struct k_itimer *timer, + struct task_struct *tsk) +{ + struct posix_cputimer_base *base = timer_base(timer, tsk); + + base->nextevt = 0; +} + +/* + * Dequeue the timer and reset the base if it was its earliest expiration. + * It makes sure the next tick recalculates the base next expiration so we + * don't keep the costly process wide cputime counter around for a random + * amount of time, along with the tick dependency. 
+ * + * If another timer gets queued between this and the next tick, its + * expiration will update the base next event if necessary on the next + * tick. + */ +static void disarm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct posix_cputimer_base *base; + + if (!cpu_timer_dequeue(ctmr)) + return; + + base = timer_base(timer, p); + if (cpu_timer_getexpires(ctmr) == base->nextevt) + trigger_base_recalc_expires(timer, p); +} + + /* * Clean up a CPU-clock timer that is about to be destroyed. * This is called from timer deletion with the timer already locked. @@ -439,7 +490,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) if (timer->it.cpu.firing) ret = TIMER_RETRY; else - cpu_timer_dequeue(ctmr); + disarm_timer(timer, p); unlock_task_sighand(p, &flags); } @@ -498,15 +549,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer, struct task_struct *p) { - int clkidx = CPUCLOCK_WHICH(timer->it_clock); + struct posix_cputimer_base *base = timer_base(timer, p); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct posix_cputimer_base *base; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -703,16 +748,29 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires != 0 && !(val < new_expires)) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } + if (val >= new_expires) { + if (new_expires != 0) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. 
+ */ + cpu_timer_fire(timer); + } - ret = 0; + /* + * Make sure we don't keep around the process wide cputime + * counter or the tick dependency if they are not necessary. + */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + + if (!cpu_timer_queued(ctmr)) + trigger_base_recalc_expires(timer, p); + + unlock_task_sighand(p, &flags); + } out: rcu_read_unlock(); if (old) @@ -1346,8 +1404,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - if (!*newval) - return; *newval += now; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..3913222e7bcf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; - int ret = -1; + int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 52414eb5823c..5c47cc50f9e9 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -472,6 +472,13 @@ void tick_resume_local(void) else tick_resume_oneshot(); } + + /* + * Ensure that hrtimers are up to date and the clockevents device + * is reprogrammed correctly when high resolution timers are + * enabled. 
+ */ + hrtimers_resume_local(); } /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6a742a29e545..649f2b48e8f0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,35 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); + +#define CLOCK_SET_WALL \ + (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \ + BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT)) + +#define CLOCK_SET_BOOT \ + (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT)) + +void clock_was_set(unsigned int bases); +void clock_was_set_delayed(void); + +void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. 
+ */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a364aa9881a..b348749a9fc6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,8 +1323,8 @@ out: write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); if (!ret) audit_tk_injoffset(ts_delta); @@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */ write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif @@ -1810,8 +1810,10 @@ void timekeeping_resume(void) touch_softlockup_watchdog(); + /* Resume the clockevent device(s) and hrtimers */ tick_resume(); - hrtimers_resume(); + /* Notify timerfd as resume is equivalent to clock_was_set() */ + timerfd_resume(); } int timekeeping_suspend(void) @@ -2125,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static void timekeeping_advance(enum timekeeping_adv_mode mode) +static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; @@ -2196,9 +2198,8 @@ static void 
timekeeping_advance(enum timekeeping_adv_mode mode) write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (clock_set) - /* Have to call _delayed version, since in irq context*/ - clock_was_set_delayed(); + + return !!clock_set; } /** @@ -2207,7 +2208,8 @@ out: */ void update_wall_time(void) { - timekeeping_advance(TK_ADV_TICK); + if (timekeeping_advance(TK_ADV_TICK)) + clock_was_set_delayed(); } /** @@ -2387,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; - unsigned long flags; + bool clock_set = false; struct timespec64 ts; + unsigned long flags; s32 orig_tai, tai; int ret; @@ -2423,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + clock_set = true; } tk_update_leap_state(tk); @@ -2433,10 +2437,10 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - timekeeping_advance(TK_ADV_FREQ); + clock_set |= timekeeping_advance(TK_ADV_FREQ); - if (tai != orig_tai) - clock_was_set(); + if (clock_set) + clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e5cdf98f50c2..73604bff3d2c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1680,33 +1680,6 @@ config DEBUG_WQ_FORCE_RR_CPU feature by default. When enabled, memory and cache locality will be impacted. -config DEBUG_BLOCK_EXT_DEVT - bool "Force extended block device numbers and spread them" - depends on DEBUG_KERNEL - depends on BLOCK - default n - help - BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON - SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT - YOU ARE DOING. Distros, please enable this and fix whatever - is broken. 
- - Conventionally, block device numbers are allocated from - predetermined contiguous area. However, extended block area - may introduce non-contiguous block device numbers. This - option forces most block device numbers to be allocated from - the extended space and spreads them to discover kernel or - userland code paths which assume predetermined contiguous - device number allocation. - - Note that turning on this debug option shuffles all the - device numbers for all IDE and SCSI devices including libata - ones, so root partition specified using device number - directly (via rdev or root=MAJ:MIN) won't work anymore. - Textual device names (root=/dev/sdXn) will continue to work. - - Say N if you are unsure. - config CPU_HOTPLUG_STATE_CONTROL bool "Enable CPU hotplug state control" depends on DEBUG_KERNEL diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..cd06dca232c3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -807,6 +807,7 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -928,6 +929,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + del_timer_sync(&bdi->laptop_mode_wb_timer); + /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..c12f67cbfa19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2010,7 +2010,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = @@ -2045,7 +2044,6 @@ void laptop_sync_completion(void) 
rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload diff --git a/net/socket.c b/net/socket.c index 0b2dad3bdf7f..532fff5a3684 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1722,32 +1722,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -int __sys_accept4_file(struct file *file, unsigned file_flags, +struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags, - unsigned long nofile) + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd; + int err, len; struct sockaddr_storage address; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file); - if (!sock) { - err = -ENOTSOCK; - goto out; - } + if (!sock) + return ERR_PTR(-ENOTSOCK); - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = sock->type; newsock->ops = sock->ops; @@ -1758,18 +1748,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = __get_unused_fd_flags(flags, nofile); - if (unlikely(newfd < 0)) { - err = newfd; - sock_release(newsock); - goto out; - } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - put_unused_fd(newfd); - goto out; - } + if (IS_ERR(newfile)) + return newfile; err = security_socket_accept(sock, newsock); if (err) @@ -1794,16 +1775,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, } /* File flags are not inherited via accept() unlike another OSes. 
*/ - - fd_install(newfd, newfile); - err = newfd; -out: - return err; + return newfile; out_fd: fput(newfile); - put_unused_fd(newfd); - goto out; + return ERR_PTR(err); +} +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags, + unsigned long nofile) +{ + struct file *newfile; + int newfd; + + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + newfd = __get_unused_fd_flags(flags, nofile); + if (unlikely(newfd < 0)) + return newfd; + + newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen, + flags); + if (IS_ERR(newfile)) { + put_unused_fd(newfd); + return PTR_ERR(newfile); + } + fd_install(newfd, newfile); + return newfd; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66..b7dbdcbdeb6c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3149,24 +3149,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp,