From 3ad4a31620355358316fa08fcfab37b9d6c33347 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 27 Jul 2021 15:51:30 +0300 Subject: [PATCH 001/315] ata: sata_dwc_460ex: No need to call phy_exit() befre phy_init() Last change to device managed APIs cleaned up error path to simple phy_exit() call, which in some cases has been executed with NULL parameter. This per se is not a problem, but rather logical misconception: no need to free resource when it's for sure has not been allocated yet. Fix the driver accordingly. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210727125130.19977-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/ata/sata_dwc_460ex.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c index f0ef844428bb..338c2e50f759 100644 --- a/drivers/ata/sata_dwc_460ex.c +++ b/drivers/ata/sata_dwc_460ex.c @@ -1259,24 +1259,20 @@ static int sata_dwc_probe(struct platform_device *ofdev) irq = irq_of_parse_and_map(np, 0); if (irq == NO_IRQ) { dev_err(&ofdev->dev, "no SATA DMA irq\n"); - err = -ENODEV; - goto error_out; + return -ENODEV; } #ifdef CONFIG_SATA_DWC_OLD_DMA if (!of_find_property(np, "dmas", NULL)) { err = sata_dwc_dma_init_old(ofdev, hsdev); if (err) - goto error_out; + return err; } #endif hsdev->phy = devm_phy_optional_get(hsdev->dev, "sata-phy"); - if (IS_ERR(hsdev->phy)) { - err = PTR_ERR(hsdev->phy); - hsdev->phy = NULL; - goto error_out; - } + if (IS_ERR(hsdev->phy)) + return PTR_ERR(hsdev->phy); err = phy_init(hsdev->phy); if (err) From c52787b590634646d4da3d8f23c4532ba050d40d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:52 +1100 Subject: [PATCH 002/315] x86/smp: Add a per-cpu view of SMT state A new field smt_active in cpuinfo_x86 identifies if the current core/cpu is in SMT mode or not. This is helpful when the system has some of its cores with threads offlined and can be used for cases where action is taken based on the state of SMT. The upcoming support for paranoid L1D flush will make use of this information. Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-2-sblbir@amazon.com --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/smpboot.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f3020c54e2cb..1e0d13c9fda6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -136,6 +136,8 @@ struct cpuinfo_x86 { u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; + /* Is SMT active on this core? */ + bool smt_active; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9320285a5e29..85f6e242b6b4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -610,6 +610,9 @@ void set_cpu_sibling_map(int cpu) if (threads > __max_smt_threads) __max_smt_threads = threads; + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -1552,8 +1555,13 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); From 371b09c6fdc436f2c7bb67fc90df5eec8ce90f06 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:53 +1100 Subject: [PATCH 003/315] x86/mm: Refactor cond_ibpb() to support other use cases cond_ibpb() has the necessary bits required to track the previous mm in switch_mm_irqs_off(). This can be reused for other use cases like L1D flushing on context switch. Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-3-sblbir@amazon.com --- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 53 ++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fa952eadbc2e..b587a9ee9cb2 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -83,7 +83,7 @@ struct tlb_state { /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; - unsigned long last_user_mm_ibpb; + unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index cfe6b1e85fa6..c98bc84a82f6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -43,10 +43,14 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. + * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB /* * The x86 feature is called PCID (Process Context IDentifier). It is similar @@ -317,20 +321,29 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; - return (unsigned long)next->mm | ibpb; + return (unsigned long)next->mm | spec_bits; } -static void cond_ibpb(struct task_struct *next) +static void cond_mitigation(struct task_struct *next) { + unsigned long prev_mm, next_mm; + if (!next || !next->mm) return; + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + /* + * Avoid user/user BTB poisoning by flushing the branch predictor + * when switching between processes. This stops one process from + * doing Spectre-v2 attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -340,8 +353,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. */ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -371,20 +382,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. - */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. + * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -393,11 +398,12 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != + (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } #ifdef CONFIG_PERF_EVENTS @@ -531,11 +537,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. */ - cond_ibpb(tsk); + cond_mitigation(tsk); /* * Stop remote flushes for the previous mm. @@ -643,7 +648,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); From 58e106e725eed59896b9141a1c9a917d2f67962a Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 21:59:11 +0200 Subject: [PATCH 004/315] sched: Add task_work callback for paranoid L1D flush The upcoming paranoid L1D flush infrastructure allows to conditionally (opt-in) flush L1D in switch_mm() as a defense against potential new side channels or for paranoia reasons. As the flush makes only sense when a task runs on a non-SMT enabled core, because SMT siblings share L1, the switch_mm() logic will kill a task which is flagged for L1D flush when it is running on a SMT thread. Add a taskwork callback so switch_mm() can queue a SIG_KILL command which is invoked when the task tries to return to user space. Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com --- arch/Kconfig | 3 +++ include/linux/sched.h | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 129df498a8e1..98db63496bab 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1282,6 +1282,9 @@ config ARCH_SPLIT_ARG64 config ARCH_HAS_ELFCORE_COMPAT bool +config ARCH_HAS_PARANOID_L1D_FLUSH + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8d07d88641..c048e59d3fbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,6 +1400,16 @@ struct task_struct { struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH + /* + * If L1D flush is supported on mm context switch + * then we use this callback head to queue kill work + * to kill tasks that are not running on SMT disabled + * cores + */ + struct callback_head l1d_flush_kill; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. From 8aacd1eab53ec853c2d29cdc9b64e9dc87d2a519 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 22:09:43 +0200 Subject: [PATCH 005/315] x86/process: Make room for TIF_SPEC_L1D_FLUSH The upcoming support for paranoid L1D flush in switch_mm() requires that TIF_SPEC_IB and the new TIF_SPEC_L1D_FLUSH are two consecutive bits in thread_info::flags. Move TIF_SPEC_FORCE_UPDATE to a spare bit to make room for the new one. Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index de406d93b515..d9afd354549c 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -81,7 +81,6 @@ struct thread_info { #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ -#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -93,6 +92,7 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +#define TIF_SPEC_FORCE_UPDATE 23 /* Force speculation MSR update in context switch */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -104,7 +104,6 @@ struct thread_info { #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) -#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) @@ -115,6 +114,7 @@ struct thread_info { #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) From b5f06f64e269f9820cd5ad9e9a98afa6c8914b7a Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 26 Apr 2021 21:42:30 +0200 Subject: [PATCH 006/315] x86/mm: Prepare for opt-in based L1D flush in switch_mm() The goal of this is to allow tasks that want to protect sensitive information, against e.g. the recently found snoop assisted data sampling vulnerabilites, to flush their L1D on being switched out. This protects their data from being snooped or leaked via side channels after the task has context switched out. This could also be used to wipe L1D when an untrusted task is switched in, but that's not a really well defined scenario while the opt-in variant is clearly defined. The mechanism is default disabled and can be enabled on the kernel command line. Prepare for the actual prctl based opt-in: 1) Provide the necessary setup functionality similar to the other mitigations and enable the static branch when the command line option is set and the CPU provides support for hardware assisted L1D flushing. Software based L1D flush is not supported because it's CPU model specific and not really well defined. This does not come with a sysfs file like the other mitigations because it is not bound to any specific vulnerability. Support has to be queried via the prctl(2) interface. 2) Add TIF_SPEC_L1D_FLUSH next to L1D_SPEC_IB so the two bits can be mangled into the mm pointer in one go which allows to reuse the existing mechanism in switch_mm() for the conditional IBPB speculation barrier efficiently. 3) Add the L1D flush specific functionality which flushes L1D when the outgoing task opted in. Also check whether the incoming task has requested L1D flush and if so validate that it is not accidentaly running on an SMT sibling as this makes the whole excercise moot because SMT siblings share L1D which opens tons of other attack vectors. If that happens schedule task work which signals the incoming task on return to user/guest with SIGBUS as this is part of the paranoid L1D flush contract. Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-1-sblbir@amazon.com --- arch/x86/Kconfig | 1 + arch/x86/include/asm/nospec-branch.h | 2 + arch/x86/include/asm/thread_info.h | 2 + arch/x86/kernel/cpu/bugs.c | 37 ++++++++++++++++++ arch/x86/mm/tlb.c | 58 +++++++++++++++++++++++++++- 5 files changed, 98 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 49270655e827..d8a2c3fe492a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -119,6 +119,7 @@ config X86 select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 + select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 3ad8c6d3cbb3..ec2d5c8c6694 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -252,6 +252,8 @@ DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + #include /** diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index d9afd354549c..cf132663c219 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -81,6 +81,7 @@ struct thread_info { #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ +#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ @@ -104,6 +105,7 @@ struct thread_info { #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) +#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d41b70fe4918..1a5a1b085eaa 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -43,6 +43,7 @@ static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -76,6 +77,13 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + void __init check_bugs(void) { identify_boot_cpu(); @@ -111,6 +119,7 @@ void __init check_bugs(void) mds_select_mitigation(); taa_select_mitigation(); srbds_select_mitigation(); + l1d_flush_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -491,6 +500,34 @@ static int __init srbds_parse_cmdline(char *str) } early_param("srbds", srbds_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static void __init l1d_flush_select_mitigation(void) +{ + if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + return; + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c98bc84a82f6..59ba2968af1b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -43,11 +45,12 @@ */ /* - * Bits to mangle the TIF_SPEC_IB state into the mm pointer which is + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL -#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB) +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) /* Bits to set when tlbstate and flush is (re)initialized */ #define LAST_USER_MM_INIT LAST_USER_MM_IBPB @@ -321,11 +324,52 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. + */ +static void l1d_flush_force_sigbus(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) +{ + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) + return; + + /* + * Validate that it is not running on an SMT sibling as this would + * make the excercise pointless because the siblings share L1D. If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); + } +} + static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + return (unsigned long)next->mm | spec_bits; } @@ -403,6 +447,16 @@ static void cond_mitigation(struct task_struct *next) indirect_branch_prediction_barrier(); } + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. + */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); } From e893bb1bb4d2eb635eba61e5d9c5135d96855773 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:55 +1100 Subject: [PATCH 007/315] x86, prctl: Hook L1D flushing in via prctl Use the existing PR_GET/SET_SPECULATION_CTRL API to expose the L1D flush capability. For L1D flushing PR_SPEC_FORCE_DISABLE and PR_SPEC_DISABLE_NOEXEC are not supported. Enabling L1D flush does not check if the task is running on an SMT enabled core, rather a check is done at runtime (at the time of flush), if the task runs on a SMT sibling then the task is sent a SIGBUS which is executed before the task returns to user space or to a guest. This is better than the other alternatives of: a. Ensuring strict affinity of the task (hard to enforce without further changes in the scheduler) b. Silently skipping flush for tasks that move to SMT enabled cores. Hook up the core prctl and implement the x86 specific parts which in turn makes it functional. Suggested-by: Thomas Gleixner Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-5-sblbir@amazon.com --- arch/x86/kernel/cpu/bugs.c | 33 +++++++++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 1 + 2 files changed, 34 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1a5a1b085eaa..ecfca3bbcd96 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1252,6 +1252,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -1361,6 +1379,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -1377,6 +1397,17 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -1427,6 +1458,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 967d9c55323d..964c41ed303e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -213,6 +213,7 @@ struct prctl_mm_map { /* Speculation control variants */ # define PR_SPEC_STORE_BYPASS 0 # define PR_SPEC_INDIRECT_BRANCH 1 +# define PR_SPEC_L1D_FLUSH 2 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ # define PR_SPEC_NOT_AFFECTED 0 # define PR_SPEC_PRCTL (1UL << 0) From b7fe54f6c2d437082dcbecfbd832f38edd9caaf4 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 8 Jan 2021 23:10:56 +1100 Subject: [PATCH 008/315] Documentation: Add L1D flushing Documentation Add documentation of l1d flushing, explain the need for the feature and how it can be used. Signed-off-by: Balbir Singh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210108121056.21940-6-sblbir@amazon.com --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../admin-guide/hw-vuln/l1d_flush.rst | 69 +++++++++++++++++++ .../admin-guide/kernel-parameters.txt | 17 +++++ Documentation/userspace-api/spec_ctrl.rst | 8 +++ 4 files changed, 95 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/l1d_flush.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index f12cda55538b..8cbc711cda93 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -16,3 +16,4 @@ are configurable at compile, boot or run time. multihit.rst special-register-buffer-data-sampling.rst core-scheduling.rst + l1d_flush.rst diff --git a/Documentation/admin-guide/hw-vuln/l1d_flush.rst b/Documentation/admin-guide/hw-vuln/l1d_flush.rst new file mode 100644 index 000000000000..210020bc3f56 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/l1d_flush.rst @@ -0,0 +1,69 @@ +L1D Flushing +============ + +With an increasing number of vulnerabilities being reported around data +leaks from the Level 1 Data cache (L1D) the kernel provides an opt-in +mechanism to flush the L1D cache on context switch. + +This mechanism can be used to address e.g. CVE-2020-0550. For applications +the mechanism keeps them safe from vulnerabilities, related to leaks +(snooping of) from the L1D cache. + + +Related CVEs +------------ +The following CVEs can be addressed by this +mechanism + + ============= ======================== ================== + CVE-2020-0550 Improper Data Forwarding OS related aspects + ============= ======================== ================== + +Usage Guidelines +---------------- + +Please see document: :ref:`Documentation/userspace-api/spec_ctrl.rst +` for details. + +**NOTE**: The feature is disabled by default, applications need to +specifically opt into the feature to enable it. + +Mitigation +---------- + +When PR_SET_L1D_FLUSH is enabled for a task a flush of the L1D cache is +performed when the task is scheduled out and the incoming task belongs to a +different process and therefore to a different address space. + +If the underlying CPU supports L1D flushing in hardware, the hardware +mechanism is used, software fallback for the mitigation, is not supported. + +Mitigation control on the kernel command line +--------------------------------------------- + +The kernel command line allows to control the L1D flush mitigations at boot +time with the option "l1d_flush=". The valid arguments for this option are: + + ============ ============================================================= + on Enables the prctl interface, applications trying to use + the prctl() will fail with an error if l1d_flush is not + enabled + ============ ============================================================= + +By default the mechanism is disabled. + +Limitations +----------- + +The mechanism does not mitigate L1D data leaks between tasks belonging to +different processes which are concurrently executing on sibling threads of +a physical CPU core when SMT is enabled on the system. + +This can be addressed by controlled placement of processes on physical CPU +cores or by disabling SMT. See the relevant chapter in the L1TF mitigation +document: :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst `. + +**NOTE** : The opt-in of a task for L1D flushing works only when the task's +affinity is limited to cores running in non-SMT mode. If a task which +requested L1D flushing is scheduled on a SMT-enabled core the kernel sends +a SIGBUS to the task. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb22006f713..b105db25f7a5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2421,6 +2421,23 @@ feature (tagged TLBs) on capable Intel chips. Default is 1 (enabled) + l1d_flush= [X86,INTEL] + Control mitigation for L1D based snooping vulnerability. + + Certain CPUs are vulnerable to an exploit against CPU + internal buffers which can forward information to a + disclosure gadget under certain conditions. + + In vulnerable processors, the speculatively + forwarded data can be used in a cache side channel + attack, to access data to which the attacker does + not have direct access. + + This parameter controls the mitigation. The + options are: + + on - enable the interface for the mitigation + l1tf= [X86] Control mitigation of the L1TF vulnerability on affected CPUs diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index 7ddd8f667459..5e8ed9eef9aa 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -106,3 +106,11 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); + +- PR_SPEC_L1D_FLUSH: Flush L1D Cache on context switch out of the task + (works only when tasks run on non SMT cores) + + Invocations: + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_ENABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, PR_SPEC_DISABLE, 0, 0); From 06447ae5e33bfbc5a777cc06d9854a31f3912833 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 14 Jul 2021 21:56:55 +0200 Subject: [PATCH 009/315] ioprio: move user space relevant ioprio bits to UAPI includes systemd added a modified copy of include/linux/ioprio.h into its code to get the relevant content definitions for the exposed ioprio_[get|set] system calls. Move the user space relevant ioprio bits to the UAPI includes to be able to use the ioprio_[get|set] syscalls as intended. Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20210714195655.181943-1-socketcan@hartkopp.net Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 41 +-------------------------------- include/uapi/linux/ioprio.h | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 40 deletions(-) create mode 100644 include/uapi/linux/ioprio.h diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index e9bfe6972aed..ef9ad4fb245f 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -6,46 +6,7 @@ #include #include -/* - * Gives us 8 prio classes with 13-bits of data for each class - */ -#define IOPRIO_CLASS_SHIFT (13) -#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) - -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) - -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - -/* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. - */ -enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, -}; - -/* - * 8 best effort priority levels are supported - */ -#define IOPRIO_BE_NR (8) - -enum { - IOPRIO_WHO_PROCESS = 1, - IOPRIO_WHO_PGRP, - IOPRIO_WHO_USER, -}; - -/* - * Fallback BE priority - */ -#define IOPRIO_NORM (4) +#include /* * if process has set io priority explicitly, use that. if not, convert diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h new file mode 100644 index 000000000000..77b17e08b0da --- /dev/null +++ b/include/uapi/linux/ioprio.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IOPRIO_H +#define _UAPI_LINUX_IOPRIO_H + +/* + * Gives us 8 prio classes with 13-bits of data for each class + */ +#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) + +#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) +#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) + +/* + * These are the io priority groups as implemented by CFQ. RT is the realtime + * class, it always gets premium service. BE is the best-effort scheduling + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +enum { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, + IOPRIO_CLASS_IDLE, +}; + +#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) + +/* + * 8 best effort priority levels are supported + */ +#define IOPRIO_BE_NR (8) + +enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, +}; + +/* + * Fallback BE priority + */ +#define IOPRIO_NORM (4) + +#endif /* _UAPI_LINUX_IOPRIO_H */ From 4c7251e1b576d884046e62d23505e75486f88c1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:32 +0200 Subject: [PATCH 010/315] MIPS: don't include in There is no need to include genhd.h from a random arch header, and not doing so prevents the possibility for nasty include loops. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/mips/include/asm/mach-rc32434/rb.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/include/asm/mach-rc32434/rb.h b/arch/mips/include/asm/mach-rc32434/rb.h index d502673a4f6c..34d179ca020b 100644 --- a/arch/mips/include/asm/mach-rc32434/rb.h +++ b/arch/mips/include/asm/mach-rc32434/rb.h @@ -7,8 +7,6 @@ #ifndef __ASM_RC32434_RB_H #define __ASM_RC32434_RB_H -#include - #define REGBASE 0x18000000 #define IDT434_REG_BASE ((volatile void *) KSEG1ADDR(REGBASE)) #define UART0BASE 0x58000 From e45cef51dba9765a6e1df1be724f3d26323512c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:33 +0200 Subject: [PATCH 011/315] bvec: fix the include guards for bvec.h Fix the include guards to match the file naming. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index ff832e698efb..883faf5f1523 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -4,8 +4,8 @@ * * Copyright (C) 2001 Ming Lei */ -#ifndef __LINUX_BVEC_ITER_H -#define __LINUX_BVEC_ITER_H +#ifndef __LINUX_BVEC_H +#define __LINUX_BVEC_H #include #include @@ -183,4 +183,4 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } -#endif /* __LINUX_BVEC_ITER_H */ +#endif /* __LINUX_BVEC_H */ From e6e7471706dc42cbe0e01278540c0730138d43e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:34 +0200 Subject: [PATCH 012/315] bvec: add a bvec_kmap_local helper Add a helper to call kmap_local_page on a bvec. There is no need for an unmap helper given that kunmap_local accept any address in the mapped page. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 883faf5f1523..f8710af18eef 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -7,6 +7,7 @@ #ifndef __LINUX_BVEC_H #define __LINUX_BVEC_H +#include #include #include #include @@ -183,4 +184,16 @@ static inline void bvec_advance(const struct bio_vec *bvec, } } +/** + * bvec_kmap_local - map a bvec into the kernel virtual address space + * @bvec: bvec to map + * + * Must be called on single-page bvecs only. Call kunmap_local on the returned + * address to unmap. + */ +static inline void *bvec_kmap_local(struct bio_vec *bvec) +{ + return kmap_local_page(bvec->bv_page) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ From f93a181af40b159aabea2ccf1a0496e9280be2d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:35 +0200 Subject: [PATCH 013/315] bvec: add memcpy_{from,to}_bvec and memzero_bvec helper Add helpers to perform common memory operation on a bvec. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f8710af18eef..f9fa43b940ff 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -196,4 +196,37 @@ static inline void *bvec_kmap_local(struct bio_vec *bvec) return kmap_local_page(bvec->bv_page) + bvec->bv_offset; } +/** + * memcpy_from_bvec - copy data from a bvec + * @bvec: bvec to copy from + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_from_bvec(char *to, struct bio_vec *bvec) +{ + memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + +/** + * memcpy_to_bvec - copy data to a bvec + * @bvec: bvec to copy to + * + * Must be called on single-page bvecs only. + */ +static inline void memcpy_to_bvec(struct bio_vec *bvec, const char *from) +{ + memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, bvec->bv_len); +} + +/** + * memzero_bvec - zero all data in a bvec + * @bvec: bvec to zero + * + * Must be called on single-page bvecs only. + */ +static inline void memzero_bvec(struct bio_vec *bvec) +{ + memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); +} + #endif /* __LINUX_BVEC_H */ From ab6c340eeac426fb649ddb4f23b7c752f0092204 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:36 +0200 Subject: [PATCH 014/315] block: use memzero_page in zero_fill_bio Use memzero_bvec to zero each segment in the bio instead of manually mapping and zeroing the data. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-6-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index 1fab762e079b..2e436bccb1e2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -495,16 +495,11 @@ EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio(struct bio *bio) { - unsigned long flags; struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + bio_for_each_segment(bv, bio, iter) + memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio); From 732022b86a37e816718786ce0b2cebc2b1739fa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:37 +0200 Subject: [PATCH 015/315] rbd: use memzero_bvec Use memzero_bvec instead of reimplementing it. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 90b947c96402..6d596c2c2cd6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1219,24 +1219,13 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) rbd_dev->mapping.size = 0; } -static void zero_bvec(struct bio_vec *bv) -{ - void *buf; - unsigned long flags; - - buf = bvec_kmap_irq(bv, &flags); - memset(buf, 0, bv->bv_len); - flush_dcache_page(bv->bv_page); - bvec_kunmap_irq(buf, &flags); -} - static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) { struct ceph_bio_iter it = *bio_pos; ceph_bio_iter_advance(&it, off); ceph_bio_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } @@ -1246,7 +1235,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) ceph_bvec_iter_advance(&it, off); ceph_bvec_iter_advance_step(&it, bytes, ({ - zero_bvec(&bv); + memzero_bvec(&bv); })); } From 18a6234ccf0661401f07b6316a25d4adbba1d4bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:38 +0200 Subject: [PATCH 016/315] dm-writecache: use bvec_kmap_local instead of bvec_kmap_irq There is no need to disable interrupts in bio_copy_block, and the local only mappings helps to avoid any sort of problems with stray writes into the bio data. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20210727055646.118787-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-writecache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index e21e29e81bbf..3d2cf811ec3e 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1214,14 +1214,13 @@ static void memcpy_flushcache_optimized(void *dest, void *source, size_t size) static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) { void *buf; - unsigned long flags; unsigned size; int rw = bio_data_dir(bio); unsigned remaining_size = wc->block_size; do { struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); - buf = bvec_kmap_irq(&bv, &flags); + buf = bvec_kmap_local(&bv); size = bv.bv_len; if (unlikely(size > remaining_size)) size = remaining_size; @@ -1239,7 +1238,7 @@ static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data memcpy_flushcache_optimized(data, buf, size); } - bvec_kunmap_irq(buf, &flags); + kunmap_local(buf); data = (char *)data + size; remaining_size -= size; From 6e0a48552b8cfc3767b98e3e8beed3f4cbafc9f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:39 +0200 Subject: [PATCH 017/315] ps3disk: use memcpy_{from,to}_bvec Use the bvec helpers instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Tested-by: Geoff Levand Link: https://lore.kernel.org/r/20210727055646.118787-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f374ea2c67ce..8d51efbe045d 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -83,26 +83,12 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, unsigned int offset = 0; struct req_iterator iter; struct bio_vec bvec; - unsigned int i = 0; - size_t size; - void *buf; rq_for_each_segment(bvec, req, iter) { - unsigned long flags; - dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %llu\n", - __func__, __LINE__, i, bio_sectors(iter.bio), - iter.bio->bi_iter.bi_sector); - - size = bvec.bv_len; - buf = bvec_kmap_irq(&bvec, &flags); if (gather) - memcpy(dev->bounce_buf+offset, buf, size); + memcpy_from_bvec(dev->bounce_buf + offset, &bvec); else - memcpy(buf, dev->bounce_buf+offset, size); - offset += size; - flush_kernel_dcache_page(bvec.bv_page); - bvec_kunmap_irq(buf, &flags); - i++; + memcpy_to_bvec(&bvec, dev->bounce_buf + offset); } } From bda135d9c03fae64c910a8c8d751eccd8408f400 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:40 +0200 Subject: [PATCH 018/315] block: remove bvec_kmap_irq and bvec_kunmap_irq These two helpers are entirely unused now. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-10-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 2203b686e1f0..7b5f65a81f2b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -5,7 +5,6 @@ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H -#include #include #include /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ @@ -519,47 +518,6 @@ static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ -#ifdef CONFIG_HIGHMEM -/* - * remember never ever reenable interrupts between a bvec_kmap_irq and - * bvec_kunmap_irq! - */ -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - unsigned long addr; - - /* - * might not be a highmem page, but the preempt/irq count - * balancing is a lot nicer this way - */ - local_irq_save(*flags); - addr = (unsigned long) kmap_atomic(bvec->bv_page); - - BUG_ON(addr & ~PAGE_MASK); - - return (char *) addr + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr); - local_irq_restore(*flags); -} - -#else -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) -{ - return page_address(bvec->bv_page) + bvec->bv_offset; -} - -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) -{ - *flags = 0; -} -#endif - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * From f8b679a070c536600c64a78c83b96aa617f8fa71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:41 +0200 Subject: [PATCH 019/315] block: rewrite bio_copy_data_iter to use bvec_kmap_local and memcpy_to_bvec Use the proper helpers instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-11-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2e436bccb1e2..0c89fa2f7a85 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1186,27 +1186,15 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf; - bytes = min(src_bv.bv_len, dst_bv.bv_len); - - src_p = kmap_atomic(src_bv.bv_page); - dst_p = kmap_atomic(dst_bv.bv_page); - - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); + src_buf = bvec_kmap_local(&src_bv); + memcpy_to_bvec(&dst_bv, src_buf); + kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); From f434cdc78e01e40fcfb8ef7e6752e3e405b84b58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:42 +0200 Subject: [PATCH 020/315] block: use memcpy_to_bvec in copy_to_high_bio_irq Use memcpy_to_bvec instead of opencoding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-12-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 94081e013c58..7e9e666c04dc 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -67,18 +67,6 @@ static __init int init_emergency_pool(void) __initcall(init_emergency_pool); -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. kmap it @@ -86,7 +74,6 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -104,11 +91,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } From d24920e20ca66780d4059e2ece9f858cbae02310 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:43 +0200 Subject: [PATCH 021/315] block: use memcpy_from_bvec in bio_copy_kern_endio_read Use memcpy_from_bvec instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-map.c b/block/blk-map.c index 3743158ddaeb..d1448aaad980 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -400,7 +400,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } From 4aebe8596ab77b0b7125e3584ed0259c4657a06d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:44 +0200 Subject: [PATCH 022/315] block: use memcpy_from_bvec in __blk_queue_bounce Rewrite the actual bounce buffering loop in __blk_queue_bounce to that the memcpy_to_bvec helper can be used to perform the data copies. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-14-hch@lst.de Signed-off-by: Jens Axboe --- block/bounce.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/block/bounce.c b/block/bounce.c index 7e9e666c04dc..05fc7148489d 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -239,24 +239,19 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * because the 'bio' is single-page bvec. */ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (!PageHighMem(page)) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(&page_pool, GFP_NOIO); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } trace_block_bio_bounce(*bio_orig); From 8aec120a9ca80c14ce002505cea1e1639f8e9ea5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:45 +0200 Subject: [PATCH 023/315] block: use bvec_kmap_local in t10_pi_type1_{prepare,complete} Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-15-hch@lst.de Signed-off-by: Jens Axboe --- block/t10-pi.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a41..00c203b2a921 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -147,11 +147,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +160,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +193,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +207,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } From 503469b5b30f76169c6302d1469e69a2fb67faf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 07:56:46 +0200 Subject: [PATCH 024/315] block: use bvec_kmap_local in bio_integrity_process Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727055646.118787-16-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4b4eb8964a6f..8f54d49dc500 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,18 +172,16 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.prot_buf = prot_buf; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } From a45e43cad798173b41e0d6f119784826d3ead02c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:55 +0200 Subject: [PATCH 025/315] block: assert the locking state in delete_partition Add a lockdep assert instead of the outdated locking comment. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-3-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 4230d4f71879..9902b1635b7d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -281,12 +281,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -/* - * Must be called either with open_mutex held, before a disk can be opened or - * after all disk users are gone. - */ static void delete_partition(struct block_device *part) { + lockdep_assert_held(&part->bd_disk->open_mutex); + fsync_bdev(part); __invalidate_device(part, true); From d7a66574b34e0b354442140927f9b787efccabfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:56 +0200 Subject: [PATCH 026/315] block: unhash the whole device inode earlier Unhash the whole device inode early in del_gendisk. This allows to remove the first GENHD_FL_UP check in the open path as we simply won't find a just removed inode. The second non-racy check after taking open_mutex is still kept. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210722075402.983367-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 +------ fs/block_dev.c | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 298ee78c1bda..716f5ca479ad 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -585,6 +585,7 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); mutex_lock(&disk->open_mutex); + remove_inode_hash(disk->part0->bd_inode); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); @@ -592,12 +593,6 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Unhash the bdev inode for this device so that it can't be looked - * up any more even if openers still hold references to it. - */ - remove_inode_hash(disk->part0->bd_inode); - set_capacity(disk, 0); if (!(disk->flags & GENHD_FL_HIDDEN)) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..932f4034ad66 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1340,7 +1340,7 @@ struct block_device *blkdev_get_no_open(dev_t dev) disk = bdev->bd_disk; if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + if (disk->flags & GENHD_FL_HIDDEN) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; From 0468c5323413c6903e4cbcef841a55e6c5578cd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:57 +0200 Subject: [PATCH 027/315] block: allocate bd_meta_info later in add_partitions Move the allocation of bd_meta_info after initializing the struct device to avoid the special bdput error handling path. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210722075402.983367-5-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 9902b1635b7d..09c58a110a89 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -356,13 +356,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - if (info) { - err = -ENOMEM; - bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); - if (!bdev->bd_meta_info) - goto out_bdput; - } - pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) @@ -386,6 +379,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -415,9 +415,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; -out_bdput: - bdput(bdev); - return ERR_PTR(err); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); From 9d3b8813895d737fcef4ec8df518f67e5cc381b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:53:58 +0200 Subject: [PATCH 028/315] block: change the refcounting for partitions Instead of acquiring an inode reference on open make sure partitions always hold device model references to the disk while alive, and switch open to grab only a device model reference to the opened block device. If that is a partition the disk reference is transitively held by the partition already. Link: https://lore.kernel.org/r/20210722075402.983367-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 9 ++++++- fs/block_dev.c | 60 ++++++++++++++++------------------------- 2 files changed, 31 insertions(+), 38 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 09c58a110a89..4f7a1a9cd544 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -261,6 +261,7 @@ static void part_release(struct device *dev) { if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); + put_disk(dev_to_bdev(dev)->bd_disk); bdput(dev_to_bdev(dev)); } @@ -349,9 +350,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); + + err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) - return ERR_PTR(-ENOMEM); + goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); @@ -420,6 +425,8 @@ out_del: device_del(pdev); out_put: put_device(pdev); +out_put_disk: + put_disk(disk); return ERR_PTR(err); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 932f4034ad66..4a6c8c0a3bc9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,16 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -1282,16 +1272,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1306,9 +1294,7 @@ done: return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1307,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if (disk->flags & GENHD_FL_HIDDEN) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** From 4b2731226d7de4302e4d8766c86e3a21c56dc3b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:00 +0200 Subject: [PATCH 029/315] loop: don't grab a reference to the block device The whole device block device won't be removed while the disk is still alive, so don't bother to grab a reference to it. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f0cdff0c5fbf..a03ef5025f27 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1304,10 +1304,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - /* Grab the block_device to prevent its destruction after we - * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); @@ -1398,7 +1394,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); if (bdev) { - bdput(bdev); invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; } From 14cf1dbb55bb07427babee425fd2a8a9300737cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:01 +0200 Subject: [PATCH 030/315] block: remove bdgrab All callers are gone, and no one should grab a pure inode reference to a block device anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210722075402.983367-9-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 15 --------------- include/linux/blkdev.h | 1 - 2 files changed, 16 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a6c8c0a3bc9..4f2c4e9e84f5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -921,21 +921,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. - */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3afea47ade6..eb1289a58917 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); From 2f4731dcd0bb73379fbb9e3eb07ae7324125caef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jul 2021 09:54:02 +0200 Subject: [PATCH 031/315] block: remove bdput Now that we've stopped using inode references for anything meaninful in the block layer get rid of the helper to put it and just open code the call to iput on the block_device inode. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210722075402.983367-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 ++-- block/partitions/core.c | 2 +- fs/block_dev.c | 6 ------ include/linux/blkdev.h | 1 - 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 716f5ca479ad..5dbb99b57b33 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1076,7 +1076,7 @@ static void disk_release(struct device *dev) xa_destroy(&disk->part_tbl); if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); - bdput(disk->part0); /* frees the disk */ + iput(disk->part0->bd_inode); /* frees the disk */ } struct class block_class = { .name = "block", @@ -1261,7 +1261,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); - bdput(disk->part0); + iput(disk->part0->bd_inode); out_free_disk: kfree(disk); return NULL; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4f7a1a9cd544..2415bffc2771 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -262,7 +262,7 @@ static void part_release(struct device *dev) if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); - bdput(dev_to_bdev(dev)); + iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) diff --git a/fs/block_dev.c b/fs/block_dev.c index 4f2c4e9e84f5..6658f40ae492 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -934,12 +934,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eb1289a58917..b5c033cf5f26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1984,7 +1984,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -void bdput(struct block_device *); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); From 26e2d7a362f6a83146ea3eaa8f17ca9ce35388d3 Mon Sep 17 00:00:00 2001 From: Abd-Alrhman Masalkhi Date: Tue, 27 Jul 2021 08:25:13 +0200 Subject: [PATCH 032/315] block: reduce stack usage in diskstats_show MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have compiled the kernel with a cross compiler "hppa-linux-gnu-" v9.3.0 on x86-64 host machine. I got the following warning: block/genhd.c: In function ‘diskstats_show’: block/genhd.c:1227:1: warning: the frame size of 1688 bytes is larger than 1280 bytes [-Wframe-larger-than=] 1227 | } By Reduced the stack footprint by using the %pg printk specifier instead of disk_name to remove the need for the on-stack buffer. Signed-off-by: Abd-Alrhman Masalkhi Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5dbb99b57b33..cf705cf95440 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1111,7 +1111,6 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct block_device *hd; - char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; unsigned long idx; @@ -1134,15 +1133,14 @@ static int diskstats_show(struct seq_file *seqf, void *v) else inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(hd->bd_dev), MINOR(hd->bd_dev), - disk_name(gp, hd->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], From a9e7bc3de4051d037a8e6f2d30448c347263737e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:14 +0200 Subject: [PATCH 033/315] block: use the %pg format specifier in printk_all_partitions Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index cf705cf95440..770f21b4fd1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -678,7 +678,6 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct block_device *part; - char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; unsigned long idx; @@ -698,11 +697,10 @@ void __init printk_all_partitions(void) xa_for_each(&disk->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - printk("%s%s %10llu %s %s", + printk("%s%s %10llu %pg %s", bdev_is_partition(part) ? " " : "", bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, - disk_name(disk, part->bd_partno, name_buf), + bdev_nr_sectors(part) >> 1, part, part->bd_meta_info ? part->bd_meta_info->uuid : ""); if (bdev_is_partition(part)) From a291bb43e5c9fdedc4be3dfd496e64e7c5a78b1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:15 +0200 Subject: [PATCH 034/315] block: use the %pg format specifier in show_partition Simplify printing the partition name by using the %pg format specifier that is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 770f21b4fd1a..6ed58fda2c05 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -778,7 +778,6 @@ static int show_partition(struct seq_file *seqf, void *v) struct gendisk *sgp = v; struct block_device *part; unsigned long idx; - char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ if (!get_capacity(sgp) || (!disk_max_parts(sgp) && @@ -791,10 +790,9 @@ static int show_partition(struct seq_file *seqf, void *v) xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; - seq_printf(seqf, "%4d %7d %10llu %s\n", + seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), - bdev_nr_sectors(part) >> 1, - disk_name(sgp, part->bd_partno, buf)); + bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; From 453b8ab696b32cfd8bad80a5501937440d1cf214 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:16 +0200 Subject: [PATCH 035/315] block: simplify printing the device names disk_stack_limits Printk ->disk_name directly for the disk and use the %pg format specifier for the block device, which is equivalent to a bdevname call. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 902c40d67120..109012719aa0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -661,15 +661,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; - - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", - top, bottom); - } + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); blk_queue_update_readahead(disk->queue); } From 1d7035478f64c040441c9cb2aa32e0d7fae526d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:17 +0200 Subject: [PATCH 036/315] block: simplify disk name formatting in check_partition disk_name for partition 0 just copies out the disk_name field. Replace the call to disk_name with a %s format specifier. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-6-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 2415bffc2771..fb3a556cacce 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -136,7 +136,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state->pp_buf[0] = '\0'; state->bdev = hd->part0; - disk_name(hd, 0, state->name); + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); From abd2864a3e46368a58f3718491521779099bfc14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2021 08:25:18 +0200 Subject: [PATCH 037/315] block: remove disk_name() Remove the disk_name function now that all users are gone. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210727062518.122108-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk.h b/block/blk.h index 4b885c0f6708..56f33fbcde59 100644 --- a/block/blk.h +++ b/block/blk.h @@ -344,7 +344,6 @@ static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); -char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 diff --git a/block/genhd.c b/block/genhd.c index 6ed58fda2c05..38f053074159 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -78,11 +78,17 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* - * Format the device name of the indicated disk into the supplied buffer and - * return a pointer to that same buffer for convenience. + * Format the device name of the indicated block device into the supplied buffer + * and return a pointer to that same buffer for convenience. + * + * Note: do not use this in new code, use the %pg specifier to sprintf and + * printk insted. */ -char *disk_name(struct gendisk *hd, int partno, char *buf) +const char *bdevname(struct block_device *bdev, char *buf) { + struct gendisk *hd = bdev->bd_disk; + int partno = bdev->bd_partno; + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) @@ -92,11 +98,6 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) return buf; } - -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_partno, buf); -} EXPORT_SYMBOL(bdevname); static void part_stat_read_all(struct block_device *part, From 2164877c7f373e14e55fca20b7c4a9c436fe4462 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Jul 2021 07:37:56 +0200 Subject: [PATCH 038/315] block: remove cmdline-parser.c cmdline-parser.c is only used by the cmdline faux partition format, so merge the code into that and avoid an indirect call. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210728053756.409654-1-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/configs/stmark2_defconfig | 1 - block/Kconfig | 10 -- block/Makefile | 1 - block/cmdline-parser.c | 255 -------------------------- block/partitions/Kconfig | 1 - block/partitions/cmdline.c | 269 +++++++++++++++++++++++++++- include/linux/cmdline-parser.h | 46 ----- 7 files changed, 263 insertions(+), 320 deletions(-) delete mode 100644 block/cmdline-parser.c delete mode 100644 include/linux/cmdline-parser.h diff --git a/arch/m68k/configs/stmark2_defconfig b/arch/m68k/configs/stmark2_defconfig index d92306472fce..8898ae321779 100644 --- a/arch/m68k/configs/stmark2_defconfig +++ b/arch/m68k/configs/stmark2_defconfig @@ -22,7 +22,6 @@ CONFIG_RAMSIZE=0x8000000 CONFIG_VECTORBASE=0x40000000 CONFIG_KERNELBASE=0x40001000 # CONFIG_BLK_DEV_BSG is not set -CONFIG_BLK_CMDLINE_PARSER=y CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y CONFIG_BINFMT_MISC=y diff --git a/block/Kconfig b/block/Kconfig index fd732aede922..15dfb7660645 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -114,16 +114,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. - - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help diff --git a/block/Makefile b/block/Makefile index bfbe4e13ca1e..c72592b4cf31 100644 --- a/block/Makefile +++ b/block/Makefile @@ -28,7 +28,6 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c deleted file mode 100644 index f2a14571882b..000000000000 --- a/block/cmdline-parser.c +++ /dev/null @@ -1,255 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Parse command line, get partition information - * - * Written by Cai Zhiyong - * - */ -#include -#include - -static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) -{ - int ret = 0; - struct cmdline_subpart *new_subpart; - - *subpart = NULL; - - new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); - if (!new_subpart) - return -ENOMEM; - - if (*partdef == '-') { - new_subpart->size = (sector_t)(~0ULL); - partdef++; - } else { - new_subpart->size = (sector_t)memparse(partdef, &partdef); - if (new_subpart->size < (sector_t)PAGE_SIZE) { - pr_warn("cmdline partition size is invalid."); - ret = -EINVAL; - goto fail; - } - } - - if (*partdef == '@') { - partdef++; - new_subpart->from = (sector_t)memparse(partdef, &partdef); - } else { - new_subpart->from = (sector_t)(~0ULL); - } - - if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); - - if (!next) { - pr_warn("cmdline partition format is invalid."); - ret = -EINVAL; - goto fail; - } - - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; - - partdef = ++next; - } else - new_subpart->name[0] = '\0'; - - new_subpart->flags = 0; - - if (!strncmp(partdef, "ro", 2)) { - new_subpart->flags |= PF_RDONLY; - partdef += 2; - } - - if (!strncmp(partdef, "lk", 2)) { - new_subpart->flags |= PF_POWERUP_LOCK; - partdef += 2; - } - - *subpart = new_subpart; - return 0; -fail: - kfree(new_subpart); - return ret; -} - -static void free_subpart(struct cmdline_parts *parts) -{ - struct cmdline_subpart *subpart; - - while (parts->subpart) { - subpart = parts->subpart; - parts->subpart = subpart->next_subpart; - kfree(subpart); - } -} - -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) -{ - int ret = -EINVAL; - char *next; - int length; - struct cmdline_subpart **next_subpart; - struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; - - *parts = NULL; - - newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); - if (!newparts) - return -ENOMEM; - - next = strchr(bdevdef, ':'); - if (!next) { - pr_warn("cmdline partition has no block device."); - goto fail; - } - - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; - newparts->nr_subparts = 0; - - next_subpart = &newparts->subpart; - - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? (sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strncpy(buf, bdevdef, length); - buf[length] = '\0'; - - ret = parse_subpart(next_subpart, buf); - if (ret) - goto fail; - - newparts->nr_subparts++; - next_subpart = &(*next_subpart)->next_subpart; - } - - if (!newparts->subpart) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - *parts = newparts; - - return 0; -fail: - free_subpart(newparts); - kfree(newparts); - return ret; -} - -void cmdline_parts_free(struct cmdline_parts **parts) -{ - struct cmdline_parts *next_parts; - - while (*parts) { - next_parts = (*parts)->next_parts; - free_subpart(*parts); - kfree(*parts); - *parts = next_parts; - } -} -EXPORT_SYMBOL(cmdline_parts_free); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) -{ - int ret; - char *buf; - char *pbuf; - char *next; - struct cmdline_parts **next_parts; - - *parts = NULL; - - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - next_parts = parts; - - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); - if (ret) - goto fail; - - if (next) - pbuf = ++next; - - next_parts = &(*next_parts)->next_parts; - } - - if (!*parts) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - ret = 0; -done: - kfree(buf); - return ret; - -fail: - cmdline_parts_free(parts); - goto done; -} -EXPORT_SYMBOL(cmdline_parts_parse); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev) -{ - while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) - parts = parts->next_parts; - return parts; -} -EXPORT_SYMBOL(cmdline_parts_find); - -/* - * add_part() - * 0 success. - * 1 can not add so many partitions. - */ -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param) -{ - sector_t from = 0; - struct cmdline_subpart *subpart; - - for (subpart = parts->subpart; subpart; - subpart = subpart->next_subpart, slot++) { - if (subpart->from == (sector_t)(~0ULL)) - subpart->from = from; - else - from = subpart->from; - - if (from >= disk_size) - break; - - if (subpart->size > (disk_size - from)) - subpart->size = disk_size - from; - - from += subpart->size; - - if (add_part(slot, subpart, param)) - break; - } - - return slot; -} -EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669e5..278593b8e4e9 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -264,7 +264,6 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cde4..482a29e95dbd 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,248 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ - -#include - +#include +#include +#include #include "check.h" + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -50,6 +278,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -147,7 +404,7 @@ int cmdline_partition(struct parsed_partitions *state) disk_size = get_capacity(state->bdev->bd_disk) << 9; - cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h deleted file mode 100644 index 68a541807bdf..000000000000 --- a/include/linux/cmdline-parser.h +++ /dev/null @@ -1,46 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Parsing command line, get the partitions information. - * - * Written by Cai Zhiyong - * - */ -#ifndef CMDLINEPARSEH -#define CMDLINEPARSEH - -#include -#include -#include - -/* partition flags */ -#define PF_RDONLY 0x01 /* Device is read only */ -#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ - -struct cmdline_subpart { - char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ - sector_t from; - sector_t size; - int flags; - struct cmdline_subpart *next_subpart; -}; - -struct cmdline_parts { - char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ - unsigned int nr_subparts; - struct cmdline_subpart *subpart; - struct cmdline_parts *next_parts; -}; - -void cmdline_parts_free(struct cmdline_parts **parts); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev); - -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param); - -#endif /* CMDLINEPARSEH */ From cf179948554a2e0d2b622317bf6bf33138ac36e5 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:25 +0200 Subject: [PATCH 039/315] block: add disk sequence number Associating uevents with block devices in userspace is difficult and racy: the uevent netlink socket is lossy, and on slow and overloaded systems has a very high latency. Block devices do not have exclusive owners in userspace, any process can set one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 can be reused again and again). A userspace process setting up a block device and watching for its events cannot thus reliably tell whether an event relates to the device it just set up or another earlier instance with the same name. Being able to set a UUID on a loop device would solve the race conditions. But it does not allow to derive orderings from uevents: if you see a uevent with a UUID that does not match the device you are waiting for, you cannot tell whether it's because the right uevent has not arrived yet, or it was already sent and you missed it. So you cannot tell whether you should wait for it or not. Associating a unique, monotonically increasing sequential number to the lifetime of each block device, which can be retrieved with an ioctl immediately upon setting it up, allows to solve the race conditions with uevents, and also allows userspace processes to know whether they should wait for the uevent they need or if it was dropped and thus they should move on. Additionally, increment the disk sequence number when the media change, i.e. on DISK_EVENT_MEDIA_CHANGE event. Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-2-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 3 +++ block/genhd.c | 24 ++++++++++++++++++++++++ include/linux/genhd.h | 2 ++ 3 files changed, 29 insertions(+) diff --git a/block/disk-events.c b/block/disk-events.c index a75931ff5da4..04c52f3992ed 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -190,6 +190,9 @@ static void disk_check_events(struct disk_events *ev, spin_unlock_irq(&ev->lock); + if (events & DISK_EVENT_MEDIA_CHANGE) + inc_diskseq(disk); + /* * Tell userland about new events. Only the events listed in * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT diff --git a/block/genhd.c b/block/genhd.c index 38f053074159..ceb08af72c1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -29,6 +29,23 @@ static struct kobject *block_depr; +/* + * Unique, monotonically increasing sequential number associated with block + * devices instances (i.e. incremented each time a device is attached). + * Associating uevents with block devices in userspace is difficult and racy: + * the uevent netlink socket is lossy, and on slow and overloaded systems has + * a very high latency. + * Block devices do not have exclusive owners in userspace, any process can set + * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 + * can be reused again and again). + * A userspace process setting up a block device and watching for its events + * cannot thus reliably tell whether an event relates to the device it just set + * up or another earlier instance with the same name. + * This sequential number allows userspace processes to solve this problem, and + * uniquely associate an uevent to the lifetime to a device. + */ +static atomic64_t diskseq; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); @@ -1252,6 +1269,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); + inc_diskseq(disk); + return disk; out_destroy_part_tbl: @@ -1352,3 +1371,8 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); + +void inc_diskseq(struct gendisk *disk) +{ + disk->diskseq = atomic64_inc_return(&diskseq); +} diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13b34177cc85..140c028845af 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -172,6 +172,7 @@ struct gendisk { int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; + u64 diskseq; }; /* @@ -332,6 +333,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, #endif /* CONFIG_SYSFS */ dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk *disk); dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK From 87eb710747126ca6606f064deef93d045486ebbe Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:26 +0200 Subject: [PATCH 040/315] block: export the diskseq in uevents Export the newly introduced diskseq in uevents: $ udevadm info /sys/class/block/* |grep -e DEVNAME -e DISKSEQ E: DEVNAME=/dev/loop0 E: DISKSEQ=1 E: DEVNAME=/dev/loop1 E: DISKSEQ=2 E: DEVNAME=/dev/loop2 E: DISKSEQ=3 E: DEVNAME=/dev/loop3 E: DISKSEQ=4 E: DEVNAME=/dev/loop4 E: DISKSEQ=5 E: DEVNAME=/dev/loop5 E: DISKSEQ=6 E: DEVNAME=/dev/loop6 E: DISKSEQ=7 E: DEVNAME=/dev/loop7 E: DISKSEQ=8 E: DEVNAME=/dev/nvme0n1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p1 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p2 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p3 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p4 E: DISKSEQ=9 E: DEVNAME=/dev/nvme0n1p5 E: DISKSEQ=9 E: DEVNAME=/dev/sda E: DISKSEQ=10 E: DEVNAME=/dev/sda1 E: DISKSEQ=10 E: DEVNAME=/dev/sda2 E: DISKSEQ=10 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-3-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index ceb08af72c1a..e1b2f898d790 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1092,8 +1092,17 @@ static void disk_release(struct device *dev) blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } + +static int block_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct gendisk *disk = dev_to_disk(dev); + + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); +} + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; static char *block_devnode(struct device *dev, umode_t *mode, From 7957d93bf32bc211415827e44fdd9cdf1388df59 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:27 +0200 Subject: [PATCH 041/315] block: add ioctl to read the disk sequence number Add a new BLKGETDISKSEQ ioctl which retrieves the disk sequence number from the genhd structure. # ./getdiskseq /dev/loop* /dev/loop0: 13 /dev/loop0p1: 13 /dev/loop0p2: 13 /dev/loop0p3: 13 /dev/loop1: 14 /dev/loop1p1: 14 /dev/loop1p2: 14 /dev/loop2: 5 /dev/loop3: 6 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-4-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/ioctl.c | 2 ++ include/uapi/linux/fs.h | 1 + 2 files changed, 3 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index 24beec9ca9c9..0c3a4a53fa11 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -469,6 +469,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); case BLKRESETZONE: diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 4c32e97dcdf0..bdf7b404b3e7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -184,6 +184,7 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +#define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* * A jump here: 130-136 are reserved for zoned block devices * (see uapi/linux/blkzoned.h) From 13927b31b13f3c6556221eff3487247bd3c7a245 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:28 +0200 Subject: [PATCH 042/315] block: export diskseq in sysfs Add a new sysfs handle to export the new diskseq value. Place it in /block//diskseq and document it. $ grep . /sys/class/block/*/diskseq /sys/class/block/loop0/diskseq:13 /sys/class/block/loop1/diskseq:14 /sys/class/block/loop2/diskseq:5 /sys/class/block/loop3/diskseq:6 /sys/class/block/ram0/diskseq:1 /sys/class/block/ram1/diskseq:2 /sys/class/block/vda/diskseq:7 Reviewed-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-5-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 12 ++++++++++++ block/genhd.c | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index e34cdeeeb9d4..a0ed87386639 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -28,6 +28,18 @@ Description: For more details refer Documentation/admin-guide/iostats.rst +What: /sys/block//diskseq +Date: February 2021 +Contact: Matteo Croce +Description: + The /sys/block//diskseq files reports the disk + sequence number, which is a monotonically increasing + number assigned to every drive. + Some devices, like the loop device, refresh such number + every time the backing file is changed. + The value type is 64 bit unsigned. + + What: /sys/block///stat Date: February 2008 Contact: Jerome Marchand diff --git a/block/genhd.c b/block/genhd.c index e1b2f898d790..a4817e42f3a3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -977,6 +977,14 @@ static ssize_t disk_discard_alignment_show(struct device *dev, return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -989,6 +997,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1034,6 +1043,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif From e6138dc12de9df17cbda9c40314d69592855ac5e Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:29 +0200 Subject: [PATCH 043/315] block: add a helper to raise a media changed event Refactor disk_check_events() and move some code into disk_event_uevent(). Then add disk_force_media_change(), a helper which will be used by devices to force issuing a DISK_EVENT_MEDIA_CHANGE event. Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Matteo Croce Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-6-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- block/disk-events.c | 61 ++++++++++++++++++++++++++++++++----------- include/linux/genhd.h | 1 + 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/block/disk-events.c b/block/disk-events.c index 04c52f3992ed..7445b8ff2775 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -163,15 +163,31 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) spin_unlock_irq(&ev->lock); } +/* + * Tell userland about new events. Only the events listed in @disk->events are + * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are + * processed internally but never get reported to userland. + */ +static void disk_event_uevent(struct gendisk *disk, unsigned int events) +{ + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + int nr_events = 0, i; + + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr) { struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; - int nr_events = 0, i; /* check events */ events = disk->fops->check_events(disk, clearing); @@ -193,19 +209,8 @@ static void disk_check_events(struct disk_events *ev, if (events & DISK_EVENT_MEDIA_CHANGE) inc_diskseq(disk); - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. - */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) + disk_event_uevent(disk, events); } /** @@ -284,6 +289,32 @@ bool bdev_check_media_change(struct block_device *bdev) } EXPORT_SYMBOL(bdev_check_media_change); +/** + * disk_force_media_change - force a media change event + * @disk: the disk which will raise the event + * @events: the events to raise + * + * Generate uevents for the disk. If DISK_EVENT_MEDIA_CHANGE is present, + * attempt to free all dentries and inodes and invalidates all block + * device page cache entries in that case. + * + * Returns %true if DISK_EVENT_MEDIA_CHANGE was raised, or %false if not. + */ +bool disk_force_media_change(struct gendisk *disk, unsigned int events) +{ + disk_event_uevent(disk, events); + + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(disk->part0, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; +} +EXPORT_SYMBOL_GPL(disk_force_media_change); + /* * Separate this part out so that a different pointer for clearing_ptr can be * passed in for disk_clear_events. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 140c028845af..849486de81c6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -237,6 +237,7 @@ extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; From 9f65c489b68d42427dc0651488dd260d678f525d Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 13 Jul 2021 01:05:30 +0200 Subject: [PATCH 044/315] loop: raise media_change event Make the loop device raise a DISK_MEDIA_CHANGE event on attach or detach. # udevadm monitor -up |grep -e DISK_MEDIA_CHANGE -e DEVNAME & # losetup -f zero [ 7.454235] loop0: detected capacity change from 0 to 16384 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop0 DEVNAME=/dev/loop0 DEVNAME=/dev/loop0 # losetup -f zero [ 10.205245] loop1: detected capacity change from 0 to 16384 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop1 # losetup -f zero2 [ 13.532368] loop2: detected capacity change from 0 to 40960 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop2 DEVNAME=/dev/loop2 # losetup -D DEVNAME=/dev/loop1 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop1 DEVNAME=/dev/loop2 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop2 DEVNAME=/dev/loop0 DISK_MEDIA_CHANGE=1 DEVNAME=/dev/loop0 Signed-off-by: Matteo Croce Reviewed-by: Christoph Hellwig Tested-by: Luca Boccassi Link: https://lore.kernel.org/r/20210712230530.29323-7-mcroce@linux.microsoft.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a03ef5025f27..f8486d9b75a4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -774,6 +774,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_err; /* and ... switch */ + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; @@ -1257,6 +1258,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, goto out_unlock; } + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); @@ -1410,6 +1412,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { @@ -2386,6 +2389,8 @@ static int loop_add(int i) disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "loop%d", i); add_disk(disk); mutex_unlock(&loop_ctl_mutex); From 2bc1f6e442eec88fa60f1ee6bef2c9871227cf8a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 13 Jul 2021 17:18:37 +0900 Subject: [PATCH 045/315] block: remove blk-mq-sysfs dead code In block/blk-mq-sysfs.c, struct blk_mq_ctx_sysfs_entry is not used to define any attribute since the "mq" sysfs directory contains only sub-directories (no attribute files). As a result, blk_mq_sysfs_show(), blk_mq_sysfs_store(), and struct sysfs_ops blk_mq_sysfs_ops are all unused and unnecessary. Remove all this unused code. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210713081837.524422-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 55 -------------------------------------------- 1 file changed, 55 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 7b52e7657b2d..253c857cba47 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -45,60 +45,12 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -198,23 +150,16 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_ctx_sysfs_release, }; From 94dace8c85717588c2b4d116759cc3253f47d0eb Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 26 Jul 2021 13:59:49 +0200 Subject: [PATCH 046/315] block/rnbd-clt: Use put_cpu_ptr after get_cpu_ptr This patch replaces put_cpu_var with put_cpu_ptr because get_cpu_ptr should be paired with put_cpu_ptr. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210726115950.470543-2-jinpu.wang@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index e9cc413495f0..bd4a41afbbfc 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -271,7 +271,7 @@ unlock: */ if (cpu_q) *cpup = cpu_q->cpu; - put_cpu_var(sess->cpu_rr); + put_cpu_ptr(sess->cpu_rr); if (q) rnbd_clt_dev_requeue(q); From 3087b335b5316cd180aa4c5a28abaa890905634e Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Mon, 26 Jul 2021 13:59:50 +0200 Subject: [PATCH 047/315] block/rnbd: Use sysfs_emit instead of s*printf function for sysfs show sysfs_emit function was added to be aware of the PAGE_SIZE maximum of the temporary buffer used for outputting sysfs content, so there is no possible overruns. So replace the uses of any s*printf functions for the sysfs show functions with sysfs_emit. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210726115950.470543-3-jinpu.wang@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 33 +++++++++++++---------------- drivers/block/rnbd/rnbd-srv-sysfs.c | 14 ++++++------ 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 324afdd63a96..4b93fd83bf79 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -227,17 +227,17 @@ static ssize_t state_show(struct kobject *kobj, switch (dev->dev_state) { case DEV_STATE_INIT: - return snprintf(page, PAGE_SIZE, "init\n"); + return sysfs_emit(page, "init\n"); case DEV_STATE_MAPPED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "open\n"); + return sysfs_emit(page, "open\n"); case DEV_STATE_MAPPED_DISCONNECTED: /* TODO fix cli tool before changing to proper state */ - return snprintf(page, PAGE_SIZE, "closed\n"); + return sysfs_emit(page, "closed\n"); case DEV_STATE_UNMAPPED: - return snprintf(page, PAGE_SIZE, "unmapped\n"); + return sysfs_emit(page, "unmapped\n"); default: - return snprintf(page, PAGE_SIZE, "unknown\n"); + return sysfs_emit(page, "unknown\n"); } } @@ -263,7 +263,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname); + return sysfs_emit(page, "%s\n", dev->pathname); } static struct kobj_attribute rnbd_clt_mapping_path_attr = @@ -276,8 +276,7 @@ static ssize_t access_mode_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return snprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(dev->access_mode)); + return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode)); } static struct kobj_attribute rnbd_clt_access_mode = @@ -286,8 +285,8 @@ static struct kobj_attribute rnbd_clt_access_mode = static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj, @@ -357,9 +356,8 @@ static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo > %s\n", + attr->attr.name); } static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, @@ -390,8 +388,7 @@ static struct kobj_attribute rnbd_clt_resize_dev_attr = static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo <1> > %s\n", attr->attr.name); } static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj, @@ -436,7 +433,7 @@ static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname); + return sysfs_emit(page, "%s\n", dev->sess->sessname); } static struct kobj_attribute rnbd_clt_session_attr = @@ -499,8 +496,8 @@ static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, - "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=] [nr_poll_queues=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", + return sysfs_emit(page, + "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=] [nr_poll_queues=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", attr->attr.name); } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index acf5fced11ef..4db98e0e76f0 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -90,8 +90,8 @@ static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%d\n", - !(sess_dev->open_flags & FMODE_WRITE)); + return sysfs_emit(page, "%d\n", + !(sess_dev->open_flags & FMODE_WRITE)); } static struct kobj_attribute rnbd_srv_dev_session_ro_attr = @@ -105,8 +105,8 @@ static ssize_t access_mode_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", - rnbd_access_mode_str(sess_dev->access_mode)); + return sysfs_emit(page, "%s\n", + rnbd_access_mode_str(sess_dev->access_mode)); } static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = @@ -119,7 +119,7 @@ static ssize_t mapping_path_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname); + return sysfs_emit(page, "%s\n", sess_dev->pathname); } static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = @@ -128,8 +128,8 @@ static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", - attr->attr.name); + return sysfs_emit(page, "Usage: echo 1 > %s\n", + attr->attr.name); } static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, From 90b7198001f23ea37d3b46dc631bdaa2357a20b1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 5 Aug 2021 10:41:59 -0700 Subject: [PATCH 048/315] blk-mq: Introduce the BLK_MQ_F_NO_SCHED_BY_DEFAULT flag elevator_get_default() uses the following algorithm to select an I/O scheduler from inside add_disk(): - In case of a single hardware queue or if sharing hardware queues across multiple request queues (BLK_MQ_F_TAG_HCTX_SHARED), use mq-deadline. - Otherwise, use 'none'. This is a good choice for most but not for all block drivers. Make it possible to override the selection of mq-deadline with a new flag, namely BLK_MQ_F_NO_SCHED_BY_DEFAULT. Cc: Christoph Hellwig Cc: Ming Lei Cc: Tetsuo Handa Cc: Martijn Coenen Cc: Jaegeuk Kim Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210805174200.3250718-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/elevator.c | 3 +++ include/linux/blk-mq.h | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index 52ada14cfe45..d0295e68f481 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -630,6 +630,9 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + if (q->nr_hw_queues != 1 && !blk_mq_is_sbitmap_shared(q->tag_set->flags)) return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1d18447ebebc..22215db36122 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -404,7 +404,13 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, + /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, + /* + * Select 'none' during queue registration in case of a single hwq + * or shared hwqs instead of 'mq-deadline'. + */ + BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, From 2112f5c1330a671fa852051d85cb9eadc05d7eb7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 5 Aug 2021 10:42:00 -0700 Subject: [PATCH 049/315] loop: Select I/O scheduler 'none' from inside add_disk() We noticed that the user interface of Android devices becomes very slow under memory pressure. This is because Android uses the zram driver on top of the loop driver for swapping, because under memory pressure the swap code alternates reads and writes quickly, because mq-deadline is the default scheduler for loop devices and because mq-deadline delays writes by five seconds for such a workload with default settings. Fix this by making the kernel select I/O scheduler 'none' from inside add_disk() for loop devices. This default can be overridden at any time from user space, e.g. via a udev rule. This approach has an advantage compared to changing the I/O scheduler from userspace from 'mq-deadline' into 'none', namely that synchronize_rcu() does not get called. This patch changes the default I/O scheduler for loop devices from 'mq-deadline' into 'none'. Additionally, this patch reduces the Android boot time on my test setup with 0.5 seconds compared to configuring the loop I/O scheduler from user space. Cc: Christoph Hellwig Cc: Ming Lei Cc: Tetsuo Handa Cc: Martijn Coenen Cc: Jaegeuk Kim Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210805174200.3250718-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8486d9b75a4..fa1c298a8cfb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2333,7 +2333,8 @@ static int loop_add(int i) lo->tag_set.queue_depth = 128; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); From c66fd019713e9cf7d6f1243c378cd177d01fe18a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:40 +0200 Subject: [PATCH 050/315] block: make the block holder code optional Move the block holder code into a separate file as it is not in any way related to the other block_dev.c code, and add a new selectable config option for it so that we don't have to build it without any remapped drivers selected. The Kconfig symbol contains a _DEPRECATED suffix to match the comments added in commit 49731baa41df ("block: restore multiple bd_link_disk_holder() support"). Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++ block/Makefile | 1 + block/holder.c | 139 ++++++++++++++++++++++++++++++++++++ drivers/md/Kconfig | 2 + drivers/md/bcache/Kconfig | 1 + fs/block_dev.c | 144 +------------------------------------- include/linux/blk_types.h | 2 +- include/linux/genhd.h | 4 +- 8 files changed, 151 insertions(+), 146 deletions(-) create mode 100644 block/holder.c diff --git a/block/Kconfig b/block/Kconfig index 15dfb7660645..bac87d773c54 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -241,4 +241,8 @@ config BLK_MQ_RDMA config BLK_PM def_bool BLOCK && PM +# do not use in new code +config BLOCK_HOLDER_DEPRECATED + bool + source "block/Kconfig.iosched" diff --git a/block/Makefile b/block/Makefile index c72592b4cf31..0d951adce796 100644 --- a/block/Makefile +++ b/block/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/holder.c b/block/holder.c new file mode 100644 index 000000000000..904a1dcd5c12 --- /dev/null +++ b/block/holder.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +struct bd_holder_disk { + struct list_head list; + struct gendisk *disk; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &bdev->bd_holder_disks, list) + if (holder->disk == disk) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. + * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + mutex_lock(&bdev->bd_disk->open_mutex); + + WARN_ON_ONCE(!bdev->bd_holder); + + /* FIXME: remove the following once add_disk() handles errors */ + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + goto out_unlock; + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->disk = disk; + holder->refcnt = 1; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free; + + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_holder_dir); + + list_add(&holder->list, &bdev->bd_holder_disks); + goto out_unlock; + +out_del: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free: + kfree(holder); +out_unlock: + mutex_unlock(&bdev->bd_disk->open_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. + */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + mutex_lock(&bdev->bd_disk->open_mutex); + holder = bd_find_holder_disk(bdev, disk); + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + mutex_unlock(&bdev->bd_disk->open_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0602e82a9516..f821dae101a9 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -15,6 +15,7 @@ if MD config BLK_DEV_MD tristate "RAID support" + select BLOCK_HOLDER_DEPRECATED if SYSFS help This driver lets you combine several hard disk partitions into one logical block device. This can be used to simply append one @@ -201,6 +202,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" + select BLOCK_HOLDER_DEPRECATED if SYSFS select BLK_DEV_DM_BUILTIN depends on DAX || DAX=n help diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d1ca4d059c20..cf3e8096942a 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -2,6 +2,7 @@ config BCACHE tristate "Block device as cache" + select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 help Allows a block device to be used as cache for other devices; uses diff --git a/fs/block_dev.c b/fs/block_dev.c index 6658f40ae492..ae9651cad923 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -902,7 +902,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif bdev->bd_stats = alloc_percpu(struct disk_stats); @@ -1063,148 +1063,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This functions creates the following sysfs symlinks. - * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the calimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 290f9061b29a..7a4e139d24ef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,7 +34,7 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head bd_holder_disks; #endif struct kobject *bd_holder_dir; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 849486de81c6..e21a91c16a79 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -318,7 +318,7 @@ void set_capacity(struct gendisk *disk, sector_t size); int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -#ifdef CONFIG_SYSFS +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else @@ -331,7 +331,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -#endif /* CONFIG_SYSFS */ +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); From fbd9a39542ecdd2ade55869c13856b2590db3df8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:41 +0200 Subject: [PATCH 051/315] block: remove the extra kobject reference in bd_link_disk_holder Since commit 0d02129e76ed ("block: merge struct block_device and struct hd_struct") there is no way for the bdev to go away as long as there is a holder, so remove the extra references. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-3-hch@lst.de Signed-off-by: Jens Axboe --- block/holder.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/holder.c b/block/holder.c index 904a1dcd5c12..960654a71342 100644 --- a/block/holder.c +++ b/block/holder.c @@ -92,11 +92,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -130,7 +125,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } From 0dbcfe247f22a6d73302dfa691c48b3c14d31c4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:42 +0200 Subject: [PATCH 052/315] block: look up holders by bdev Invert they way the holder relations are tracked. This very slightly reduces the memory overhead for partitioned devices. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804094147.459763-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 +++- block/holder.c | 18 +++++++++--------- fs/block_dev.c | 3 --- include/linux/blk_types.h | 3 --- include/linux/genhd.h | 4 +++- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index a4817e42f3a3..cd4eab744667 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1289,7 +1289,9 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk; out_destroy_part_tbl: diff --git a/block/holder.c b/block/holder.c index 960654a71342..11e65d99a9fb 100644 --- a/block/holder.c +++ b/block/holder.c @@ -3,7 +3,7 @@ struct bd_holder_disk { struct list_head list; - struct gendisk *disk; + struct block_device *bdev; int refcnt; }; @@ -12,8 +12,8 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, { struct bd_holder_disk *holder; - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->bdev == bdev) return holder; return NULL; } @@ -61,7 +61,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -82,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list); - holder->disk = disk; + holder->bdev = bdev; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -93,7 +93,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_del; - list_add(&holder->list, &bdev->bd_holder_disks); + list_add(&holder->list, &disk->slave_bdevs); goto out_unlock; out_del: @@ -101,7 +101,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -120,7 +120,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -128,6 +128,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) list_del_init(&holder->list); kfree(holder); } - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/fs/block_dev.c b/fs/block_dev.c index ae9651cad923..cc801767a377 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -902,9 +902,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7a4e139d24ef..e92735655684 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -34,9 +34,6 @@ struct block_device { void * bd_holder; int bd_holders; bool bd_write_holder; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - struct list_head bd_holder_disks; -#endif struct kobject *bd_holder_dir; u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e21a91c16a79..0721807d76ee 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,7 +159,9 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct kobject *slave_dir; - +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; From d626338735909bc2b2e7cafc332f44ed41cfdeee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:43 +0200 Subject: [PATCH 053/315] block: support delayed holder registration device mapper needs to register holders before it is ready to do I/O. Currently it does so by registering the disk early, which can leave the disk and queue in a weird half state where the queue is registered with the disk, except for sysfs and the elevator. And this state has been a bit promlematic before, and will get more so when sorting out the responsibilities between the queue and the disk. Support registering holders on an initialized but not registered disk instead by delaying the sysfs registration until the disk is registered. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++++ block/holder.c | 68 ++++++++++++++++++++++++++++++++----------- include/linux/genhd.h | 5 ++++ 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index cd4eab744667..db916f779077 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -447,6 +447,16 @@ static void register_disk(struct device *parent, struct gendisk *disk, kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + if (disk->flags & GENHD_FL_HIDDEN) return; diff --git a/block/holder.c b/block/holder.c index 11e65d99a9fb..4568cc4f6827 100644 --- a/block/holder.c +++ b/block/holder.c @@ -28,6 +28,19 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); } +static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + int ret; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + return ret; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + return ret; +} + /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -66,7 +79,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) + if (WARN_ON(!bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -84,28 +97,28 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) INIT_LIST_HEAD(&holder->list); holder->bdev = bdev; holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; + if (disk->slave_dir) { + ret = __link_disk_holder(bdev, disk); + if (ret) { + kfree(holder); + goto out_unlock; + } + } list_add(&holder->list, &disk->slave_bdevs); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); +static void __unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); +} + /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -123,11 +136,32 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (disk->slave_dir) + __unlink_disk_holder(bdev, disk); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); + +int bd_register_pending_holders(struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret; + + mutex_lock(&disk->open_mutex); + list_for_each_entry(holder, &disk->slave_bdevs, list) { + ret = __link_disk_holder(holder->bdev, disk); + if (ret) + goto out_undo; + } + mutex_unlock(&disk->open_mutex); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) + __unlink_disk_holder(holder->bdev, disk); + mutex_unlock(&disk->open_mutex); + return ret; +} diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0721807d76ee..80952f038d79 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -323,6 +323,7 @@ long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -333,6 +334,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); From 74a2b6ec9380959546d95ecc01a8fe6c7157add9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:44 +0200 Subject: [PATCH 054/315] dm: cleanup cleanup_mapped_device md->queue is now always set when md->disk is set, so simplify the conditionals a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2c5f9e585211..7971ec8ce677 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1694,13 +1694,9 @@ static void cleanup_mapped_device(struct mapped_device *md) md->disk->private_data = NULL; spin_unlock(&_minor_lock); del_gendisk(md->disk); - } - - if (md->queue) dm_queue_destroy_keyslot_manager(md->queue); - - if (md->disk) blk_cleanup_disk(md->disk); + } cleanup_srcu_struct(&md->io_barrier); From ba30585936b0b88f0fb2b19be279b346a6cc87eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:45 +0200 Subject: [PATCH 055/315] dm: move setting md->type into dm_setup_md_queue Move setting md->type from both callers into dm_setup_md_queue. This ensures that md->type is only set to a valid value after the queue has been fully setup, something we'll rely on future changes. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-ioctl.c | 4 ---- drivers/md/dm.c | 5 +++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2209cbcd84db..2575074a2204 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1436,9 +1436,6 @@ static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_si } if (dm_get_md_type(md) == DM_TYPE_NONE) { - /* Initial table load: acquire type of table. */ - dm_set_md_type(md, dm_table_get_type(t)); - /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { @@ -2187,7 +2184,6 @@ int __init dm_early_create(struct dm_ioctl *dmi, if (r) goto err_destroy_table; - md->type = dm_table_get_type(t); /* setup md->queue to reflect md's type (may block) */ r = dm_setup_md_queue(md, t); if (r) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7971ec8ce677..f003bd5b93ce 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2052,9 +2052,9 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); */ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { - int r; + enum dm_queue_mode type = dm_table_get_type(t); struct queue_limits limits; - enum dm_queue_mode type = dm_get_md_type(md); + int r; switch (type) { case DM_TYPE_REQUEST_BASED: @@ -2081,6 +2081,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r; + md->type = type; blk_register_queue(md->disk); From 89f871af1b26d98d983cba7ed0e86effa45ba5f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:46 +0200 Subject: [PATCH 056/315] dm: delay registering the gendisk device mapper is currently the only outlier that tries to call register_disk after add_disk, leading to fairly inconsistent state of these block layer data structures. Instead change device-mapper to just register the gendisk later now that the holder mechanism can cope with that. Note that this introduces a user visible change: the dm kobject is now only visible after the initial table has been loaded. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 1 - drivers/md/dm.c | 25 ++++++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 0dbd48cbdff9..5b95eea517d1 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -559,7 +559,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) err = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (err) goto out_tag_set; - elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f003bd5b93ce..7981b7287628 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1693,7 +1693,10 @@ static void cleanup_mapped_device(struct mapped_device *md) spin_lock(&_minor_lock); md->disk->private_data = NULL; spin_unlock(&_minor_lock); - del_gendisk(md->disk); + if (dm_get_md_type(md) != DM_TYPE_NONE) { + dm_sysfs_exit(md); + del_gendisk(md->disk); + } dm_queue_destroy_keyslot_manager(md->queue); blk_cleanup_disk(md->disk); } @@ -1788,7 +1791,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad; } - add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -1989,19 +1991,12 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { - int r; struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; - r = dm_sysfs_init(md); - if (r) { - free_dev(md); - return r; - } - *result = md; return 0; } @@ -2081,10 +2076,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) r = dm_table_set_restrictions(t, md->queue, &limits); if (r) return r; + + add_disk(md->disk); + + r = dm_sysfs_init(md); + if (r) { + del_gendisk(md->disk); + return r; + } md->type = type; - - blk_register_queue(md->disk); - return 0; } @@ -2190,7 +2190,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", dm_device_name(md), atomic_read(&md->holders)); - dm_sysfs_exit(md); dm_table_destroy(__unbind(md)); free_dev(md); } From d1254a8749711e0d7441036a74ce592341f89697 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:41:47 +0200 Subject: [PATCH 057/315] block: remove support for delayed queue registrations Now that device mapper has been changed to register the disk once it is fully ready all this code is unused. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20210804094147.459763-9-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 1 - block/genhd.c | 29 +++++++---------------------- include/linux/genhd.h | 6 ------ 3 files changed, 7 insertions(+), 29 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index d0295e68f481..9beaafd238e0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -705,7 +705,6 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } -EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. be careful not to introduce deadlocks - diff --git a/block/genhd.c b/block/genhd.c index db916f779077..b0b6e0caa389 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -475,20 +475,20 @@ static void register_disk(struct device *parent, struct gendisk *disk, } /** - * __device_add_disk - add disk information to kernel list + * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups - * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) + +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + { int ret; @@ -498,8 +498,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * elevator if one is needed, that is, for devices requesting queue * registration. */ - if (register_queue) - elevator_init_mq(disk->queue); + elevator_init_mq(disk->queue); /* * If the driver provides an explicit major number it also must provide @@ -553,8 +552,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -568,21 +566,8 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk_add_events(disk); blk_integrity_add(disk); } - -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - -{ - __device_add_disk(parent, disk, groups, true); -} EXPORT_SYMBOL(device_add_disk); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) -{ - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); - /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 80952f038d79..473d93c6ebda 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -219,12 +219,6 @@ static inline void add_disk(struct gendisk *disk) { device_add_disk(NULL, disk, NULL); } -extern void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk); -static inline void add_disk_no_queue_reg(struct gendisk *disk) -{ - device_add_disk_no_queue_reg(NULL, disk); -} - extern void del_gendisk(struct gendisk *gp); void set_disk_ro(struct gendisk *disk, bool read_only); From 5ed964f8e54eb3191b8b7b45aeb52672a0c995dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:40 +0200 Subject: [PATCH 058/315] mm: hide laptop_mode_wb_timer entirely behind the BDI API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't leak the detaіls of the timer into the block layer, instead initialize the timer in bdi_alloc and delete it in bdi_unregister. Note that this means the timer is initialized (but not armed) for non-block queues as well now. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ----- mm/backing-dev.c | 3 +++ mm/page-writeback.c | 2 -- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 04477697ee4b..5897bc37467d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -394,10 +394,7 @@ void blk_cleanup_queue(struct request_queue *q) /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); - if (queue_is_mq(q)) blk_mq_exit_queue(q); @@ -546,8 +543,6 @@ struct request_queue *blk_alloc_queue(int node_id) atomic_set(&q->nr_active_requests_shared_sbitmap, 0); - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..cd06dca232c3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -807,6 +807,7 @@ struct backing_dev_info *bdi_alloc(int node_id) bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); return bdi; } EXPORT_SYMBOL(bdi_alloc); @@ -928,6 +929,8 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { + del_timer_sync(&bdi->laptop_mode_wb_timer); + /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f63548f247c..c12f67cbfa19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2010,7 +2010,6 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_BLOCK void laptop_mode_timer_fn(struct timer_list *t) { struct backing_dev_info *backing_dev_info = @@ -2045,7 +2044,6 @@ void laptop_sync_completion(void) rcu_read_unlock(); } -#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload From 471aa704db4904f7af5a50019ca3b5b018c0cf62 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:41 +0200 Subject: [PATCH 059/315] block: pass a gendisk to blk_queue_update_readahead .. and rename the function to disk_update_readahead. This is in preparation for moving the BDI from the request_queue to the gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 +++++--- block/blk-sysfs.c | 2 +- drivers/block/drbd/drbd_nl.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blkdev.h | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 109012719aa0..44aaef9bf736 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -380,8 +380,10 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); -void blk_queue_update_readahead(struct request_queue *q) +void disk_update_readahead(struct gendisk *disk) { + struct request_queue *q = disk->queue; + /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. @@ -391,7 +393,7 @@ void blk_queue_update_readahead(struct request_queue *q) q->backing_dev_info->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } -EXPORT_SYMBOL_GPL(blk_queue_update_readahead); +EXPORT_SYMBOL_GPL(disk_update_readahead); /** * blk_limits_io_min - set minimum request size for a device @@ -665,7 +667,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, pr_notice("%s: Warning: Device %pg is misaligned\n", disk->disk_name, bdev); - blk_queue_update_readahead(disk->queue); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 370d83c18057..3af2ab7d5086 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -866,7 +866,7 @@ int blk_register_queue(struct gendisk *disk) "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - blk_queue_update_readahead(q); + disk_update_readahead(disk); ret = blk_trace_init_sysfs(dev); if (ret) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e7d0e637e632..44ccf8b4f4b2 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1364,7 +1364,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { blk_stack_limits(&q->limits, &b->limits, 0); - blk_queue_update_readahead(q); + disk_update_readahead(device->vdisk); } fixup_discard_if_not_supported(q); fixup_write_zeroes(device, q); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0543cdf89e92..b03eabc1ed7c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2076,7 +2076,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_keyslot_manager(q, t); - blk_queue_update_readahead(q); + disk_update_readahead(t->md->disk); return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfd9dec0c1f6..f6c0a59c4b53 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1890,7 +1890,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); - blk_queue_update_readahead(ns->head->disk->queue); + disk_update_readahead(ns->head->disk); blk_mq_unfreeze_queue(ns->head->disk->queue); } return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b5c033cf5f26..ac3642c88a4d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1139,7 +1139,7 @@ void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); -void blk_queue_update_readahead(struct request_queue *q); +void disk_update_readahead(struct gendisk *disk); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); From 1008162b2782a3624d12b0aee8da58bc75d12e19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:42 +0200 Subject: [PATCH 060/315] block: add a queue_has_disk helper Add a helper to check if a gendisk is associated with a request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ac3642c88a4d..96f3d9617cd8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -664,6 +664,7 @@ extern void blk_clear_pm_only(struct request_queue *q); dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) +#define queue_has_disk(q) ((q)->kobj.parent != NULL) #define queue_to_disk(q) (dev_to_disk(kobj_to_dev((q)->kobj.parent))) static inline bool queue_is_mq(struct request_queue *q) From edb0872f44ec9976ea6d052cb4b93cd2d23ac2ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:43 +0200 Subject: [PATCH 061/315] block: move the bdi from the request_queue to the gendisk The backing device information only makes sense for file system I/O, and thus belongs into the gendisk and not the lower level request_queue structure. Move it there. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- block/blk-cgroup.c | 7 +++---- block/blk-core.c | 13 +++---------- block/blk-mq.c | 2 +- block/blk-settings.c | 14 +++++++++----- block/blk-sysfs.c | 26 ++++++++++++-------------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 23 ++++++++++++++--------- drivers/block/drbd/drbd_req.c | 5 ++--- drivers/block/pktcdvd.c | 8 +++----- fs/block_dev.c | 4 ++-- fs/fat/fatent.c | 1 + include/linux/blkdev.h | 3 --- include/linux/genhd.h | 1 + 14 files changed, 58 insertions(+), 63 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 727955918563..1576e858d3a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5266,8 +5266,8 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); + bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 575d7a2e7203..db034e35ae20 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,10 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + return NULL; + return bdi_dev_name(queue_to_disk(blkg->q)->bdi); } /** diff --git a/block/blk-core.c b/block/blk-core.c index 5897bc37467d..0874bc2fcdb4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,7 +14,6 @@ */ #include #include -#include #include #include #include @@ -531,13 +530,9 @@ struct request_queue *blk_alloc_queue(int node_id) if (ret) goto fail_id; - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_split; q->node = node_id; @@ -567,7 +562,7 @@ struct request_queue *blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; + goto fail_stats; if (blkcg_init_queue(q)) goto fail_ref; @@ -580,10 +575,8 @@ struct request_queue *blk_alloc_queue(int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); + blk_free_queue_stats(q->stats); fail_split: bioset_exit(&q->bio_split); fail_id: diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c4ac51e54eb..d2725f94491d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(queue_to_disk(q)->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 44aaef9bf736..3613d2cc0688 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -140,7 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -388,10 +391,9 @@ void disk_update_readahead(struct gendisk *disk) * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. */ - q->backing_dev_info->ra_pages = + disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - q->backing_dev_info->io_pages = - queue_max_sectors(q) >> (PAGE_SHIFT - 9); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL_GPL(disk_update_readahead); @@ -473,7 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - q->backing_dev_info->ra_pages = + if (!queue_has_disk(q)) + return; + queue_to_disk(q)->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3af2ab7d5086..1832587dce3a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -88,9 +88,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; + if (!queue_has_disk(q)) + return -EINVAL; + ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -98,13 +100,14 @@ static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!queue_has_disk(q)) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -251,7 +254,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + if (queue_has_disk(q)) + queue_to_disk(q)->bdi->io_pages = + max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -766,13 +771,6 @@ static void blk_exit_queue(struct request_queue *q) * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); } /** diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 3ed71b8da887..31086afaad9c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, - inflight); + trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, + rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. If we did not, diff --git a/block/genhd.c b/block/genhd.c index b0b6e0caa389..f8def1129501 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,10 +466,9 @@ static void register_disk(struct device *parent, struct gendisk *disk, dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - if (disk->queue->backing_dev_info->dev) { - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); + if (disk->bdi->dev) { + err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, + "bdi"); WARN_ON(err); } } @@ -540,15 +539,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; struct device *dev = disk_to_dev(disk); /* Register BDI before referencing it from bdev */ dev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(bdi, "%u:%u", + ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(bdi, dev); + bdi_set_owner(disk->bdi, dev); bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); @@ -615,7 +613,7 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + bdi_unregister(disk->bdi); } blk_unregister_queue(disk); @@ -1088,6 +1086,7 @@ static void disk_release(struct device *dev) might_sleep(); + bdi_put(disk->bdi); if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); @@ -1268,9 +1267,13 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) + goto out_free_disk; + disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) - goto out_free_disk; + goto out_free_bdi; disk->node_id = node_id; mutex_init(&disk->open_mutex); @@ -1292,6 +1295,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) out_destroy_part_tbl: xa_destroy(&disk->part_tbl); iput(disk->part0->bd_inode); +out_free_bdi: + bdi_put(disk->bdi); out_free_disk: kfree(disk); return NULL; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 13beb98a7c5a..5ca233644d70 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -905,13 +905,12 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, enum drbd_read_balancing rbm) { - struct backing_dev_info *bdi; int stripe_shift; switch (rbm) { case RB_CONGESTED_REMOTE: - bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; - return bdi_read_congested(bdi); + return bdi_read_congested( + device->ldev->backing_bdev->bd_disk->bdi); case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 538446b652de..0f26b2510a75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1183,10 +1183,8 @@ try_next_bio: wakeup = (pd->write_congestion_on > 0 && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); - if (wakeup) { - clear_bdi_congested(pd->disk->queue->backing_dev_info, - BLK_RW_ASYNC); - } + if (wakeup) + clear_bdi_congested(pd->disk->bdi, BLK_RW_ASYNC); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2366,7 +2364,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(q->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(bio->bi_bdev->bd_disk->bdi, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); congestion_wait(BLK_RW_ASYNC, HZ); diff --git a/fs/block_dev.c b/fs/block_dev.c index cc801767a377..43be5463a4c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1087,7 +1087,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) if (!bdev->bd_openers) { set_init_blocksize(bdev); if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); + bdev->bd_bdi = bdi_get(disk->bdi); } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); @@ -1122,7 +1122,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); + part->bd_bdi = bdi_get(disk->bdi); done: part->bd_openers++; return 0; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include #include +#include #include "fat.h" struct fatent_operations { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 96f3d9617cd8..23e1253a8d88 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -398,8 +397,6 @@ struct request_queue { struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; - struct backing_dev_info *backing_dev_info; - /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 473d93c6ebda..b3bab578f03a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -158,6 +158,7 @@ struct gendisk { struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ + struct backing_dev_info *bdi; struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; From a11d7fc2d05fb509cd9e33d4093507d6eda3ad53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 16:17:44 +0200 Subject: [PATCH 062/315] block: remove the bd_bdi in struct block_device Just retrieve the bdi from the disk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210809141744.1203023-6-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ++++--- fs/block_dev.c | 13 +------------ fs/nilfs2/super.c | 2 +- fs/super.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/backing-dev.h | 2 +- include/linux/blk_types.h | 1 - 7 files changed, 9 insertions(+), 20 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0c3a4a53fa11..fff161eaab42 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -506,7 +506,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: return blkdev_reread_part(bdev, mode); @@ -556,7 +556,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -628,7 +629,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) diff --git a/fs/block_dev.c b/fs/block_dev.c index 43be5463a4c4..e1c14c2e0504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -801,7 +801,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -826,16 +825,11 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -1084,11 +1078,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->bdi); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1121,8 +1112,6 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->bdi); done: part->bd_openers++; return 0; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..3ab73567a0f5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -844,7 +844,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 44df4fcef65c..29530859d9ff 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -143,7 +143,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return I_BDEV(inode)->bd_bdi; + return I_BDEV(inode)->bd_disk->bdi; #endif return sb->s_bdi; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e92735655684..1335efa8a1db 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -38,7 +38,6 @@ struct block_device { u8 bd_partno; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; - struct backing_dev_info *bd_bdi; /* The counter of freeze processes */ int bd_fsfreeze_count; From 866663b7b52d2da267b28e12eed89ee781b8fed1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 29 Jul 2021 11:42:26 +0800 Subject: [PATCH 063/315] block: return ELEVATOR_DISCARD_MERGE if possible When merging one bio to request, if they are discard IO and the queue supports multi-range discard, we need to return ELEVATOR_DISCARD_MERGE because both block core and related drivers(nvme, virtio-blk) doesn't handle mixed discard io merge(traditional IO merge together with discard merge) well. Fix the issue by returning ELEVATOR_DISCARD_MERGE in this situation, so both blk-mq and drivers just need to handle multi-range discard. Reported-by: Oleksandr Natalenko Signed-off-by: Ming Lei Tested-by: Oleksandr Natalenko Fixes: 2705dfb20947 ("block: fix discard request merge") Link: https://lore.kernel.org/r/20210729034226.1591070-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +++ block/blk-merge.c | 16 ---------------- block/elevator.c | 3 +++ block/mq-deadline-main.c | 2 ++ include/linux/blkdev.h | 16 ++++++++++++++++ 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 1576e858d3a5..e4a61eda2d0f 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2361,6 +2361,9 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } diff --git a/block/blk-merge.c b/block/blk-merge.c index a11b3b53717e..f8707ff7e2fc 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -705,22 +705,6 @@ static void blk_account_io_merge_request(struct request *req) } } -/* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. - */ -static inline bool blk_discard_mergable(struct request *req) -{ - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; -} - static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { diff --git a/block/elevator.c b/block/elevator.c index 9beaafd238e0..ff45d8388f48 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -336,6 +336,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 6f612e6dc82b..294be0c0db65 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -677,6 +677,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 23e1253a8d88..07eef02325b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1519,6 +1519,22 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector return offset << SECTOR_SHIFT; } +/* + * Two cases of handling DISCARD merge: + * If max_discard_segments > 1, the driver takes every bio + * as a range and send them to controller together. The ranges + * needn't to be contiguous. + * Otherwise, the bios/requests will be handled as same as + * others which should be contiguous. + */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); From da20b58d5bbbb0d23ae9530992a37d0f0d1787a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 6 Aug 2021 12:06:01 +0100 Subject: [PATCH 064/315] xen-blkfront: Remove redundant assignment to variable err The variable err is being assigned a value that is never read, the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210806110601.11386-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index d83fee21f6c5..715bfa8aca7f 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1092,7 +1092,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = xlbd_reserve_minors(minor, nr_minors); if (err) return err; - err = -ENODEV; memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; From 698429f9d0e54ce3964151adff886ee5fc59714b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:17 +0200 Subject: [PATCH 065/315] clocksource: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210803141621.780504-35-bigeasy@linutronix.de --- kernel/time/clocksource.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b89c76e1c02c..b8a14d2fb5ba 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -306,12 +306,12 @@ void clocksource_verify_percpu(struct clocksource *cs) return; cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); - get_online_cpus(); + cpus_read_lock(); preempt_disable(); clocksource_verify_choose_cpus(); if (cpumask_weight(&cpus_chosen) == 0) { preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } @@ -337,7 +337,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", cpumask_pr_args(&cpus_ahead), testcpu, cs->name); From 99d26de2f6d79badc80f55b54bd90d4cb9d1ad90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 14:39:24 +0200 Subject: [PATCH 066/315] writeback: make the laptop_mode prototypes available unconditionally Fix the !CONFIG_BLOCK build after the recent cleanup. Fixes: 5ed964f8e54e ("mm: hide laptop_mode_wb_timer entirely behind the BDI API") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 667e86cfbdcf..270677dc4f36 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -336,14 +336,9 @@ static inline void cgroup_writeback_umount(void) /* * mm/page-writeback.c */ -#ifdef CONFIG_BLOCK void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); -void laptop_mode_sync(struct work_struct *work); void laptop_mode_timer_fn(struct timer_list *t); -#else -static inline void laptop_sync_completion(void) { } -#endif bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK From 1dae37c7e41d9a75a615ba7b0480acc2e04094d4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 13:01:47 +0100 Subject: [PATCH 067/315] posix-timers: Remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210721120147.109570-1-colin.king@canonical.com --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dd5697d7347b..3913222e7bcf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -336,7 +336,7 @@ void posixtimer_rearm(struct kernel_siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; - int ret = -1; + int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). From a5dec9f82ab2ae486119f0b0820ea16db3e522c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:08 +0200 Subject: [PATCH 068/315] posix-cpu-timers: Assert task sighand is locked while starting cputime counter Starting the process wide cputime counter needs to be done in the same sighand locking sequence than actually arming the related timer otherwise this races against concurrent timers setting/expiring in the same threadgroup. Detecting that the cputime counter is started without holding the sighand lock is a first step toward debugging such situations. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-2-frederic@kernel.org --- include/linux/sched/signal.h | 6 ++++++ kernel/signal.c | 15 +++++++++++++++ kernel/time/posix-cpu-timers.c | 2 ++ 3 files changed, 23 insertions(+) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index b9126fe06c3f..0310a5add9ab 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -714,6 +714,12 @@ static inline void unlock_task_sighand(struct task_struct *task, spin_unlock_irqrestore(&task->sighand->siglock, *flags); } +#ifdef CONFIG_LOCKDEP +extern void lockdep_assert_task_sighand_held(struct task_struct *task); +#else +static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } +#endif + static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { diff --git a/kernel/signal.c b/kernel/signal.c index a3229add4455..52b6abec0ff8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1413,6 +1413,21 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, return sighand; } +#ifdef CONFIG_LOCKDEP +void lockdep_assert_task_sighand_held(struct task_struct *task) +{ + struct sighand_struct *sighand; + + rcu_read_lock(); + sighand = rcu_dereference(task->sighand); + if (sighand) + lockdep_assert_held(&sighand->siglock); + else + WARN_ON_ONCE(1); + rcu_read_unlock(); +} +#endif + /* * send signal info to all the members of a group */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 517be7fd175e..4693d3c71e7e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -291,6 +291,8 @@ static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples) struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; struct posix_cputimers *pct = &tsk->signal->posix_cputimers; + lockdep_assert_task_sighand_held(tsk); + /* Check if cputimer isn't running. This is accessed without locking. */ if (!READ_ONCE(pct->timers_active)) { struct task_cputime sum; From 175cc3ab28e3509ddee8de4f164b563d99daa570 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:09 +0200 Subject: [PATCH 069/315] posix-cpu-timers: Force next_expiration recalc after timer deletion A timer deletion only dequeues the timer but it doesn't shutdown the related costly process wide cputimer counter and the tick dependency. The following code snippet keeps this overhead around for one week after the timer deletion: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; val.it_value.tv_sec = 604800; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, 0, &val, NULL); timer_delete(id); } Make sure the next target's tick recalculates the nearest expiration and clears the process wide counter and tick dependency if necessary. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-3-frederic@kernel.org --- include/linux/posix-timers.h | 4 +++- kernel/time/posix-cpu-timers.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 896c16d2c5fb..4cf1fbe8d1bc 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,12 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head, return timerqueue_add(head, &ctmr->node); } -static inline void cpu_timer_dequeue(struct cpu_timer *ctmr) +static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { if (ctmr->head) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; + return true; } + return false; } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4693d3c71e7e..61c78b62fe6a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,6 +407,37 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +/* + * Dequeue the timer and reset the base if it was its earliest expiration. + * It makes sure the next tick recalculates the base next expiration so we + * don't keep the costly process wide cputime counter around for a random + * amount of time, along with the tick dependency. + * + * If another timer gets queued between this and the next tick, its + * expiration will update the base next event if necessary on the next + * tick. + */ +static void disarm_timer(struct k_itimer *timer, struct task_struct *p) +{ + struct cpu_timer *ctmr = &timer->it.cpu; + struct posix_cputimer_base *base; + int clkidx; + + if (!cpu_timer_dequeue(ctmr)) + return; + + clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + base = p->posix_cputimers.bases + clkidx; + else + base = p->signal->posix_cputimers.bases + clkidx; + + if (cpu_timer_getexpires(ctmr) == base->nextevt) + base->nextevt = 0; +} + + /* * Clean up a CPU-clock timer that is about to be destroyed. * This is called from timer deletion with the timer already locked. @@ -441,7 +472,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) if (timer->it.cpu.firing) ret = TIMER_RETRY; else - cpu_timer_dequeue(ctmr); + disarm_timer(timer, p); unlock_task_sighand(p, &flags); } From 406dd42bd1ba0c01babf9cde169bb319e52f6147 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:10 +0200 Subject: [PATCH 070/315] posix-cpu-timers: Force next expiration recalc after itimer reset When an itimer deactivates a previously armed expiration, it simply doesn't do anything. As a result the process wide cputime counter keeps running and the tick dependency stays set until it reaches the old ghost expiration value. This can be reproduced with the following snippet: void trigger_process_counter(void) { struct itimerval n = {}; n.it_value.tv_sec = 100; setitimer(ITIMER_VIRTUAL, &n, NULL); n.it_value.tv_sec = 0; setitimer(ITIMER_VIRTUAL, &n, NULL); } Fix this with resetting the relevant base expiration. This is similar to disarming a timer. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-4-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 61c78b62fe6a..5c71322b45c7 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1379,8 +1379,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - if (!*newval) - return; *newval += now; } From d9c1b2a1089f606404284b9f5b045a584d73382d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:11 +0200 Subject: [PATCH 071/315] posix-cpu-timers: Remove confusing return value override The end of the function cannot be reached with an error in variable ret. Unconfuse reviewers about that. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-5-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5c71322b45c7..4fbbbc89ac4a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -744,8 +744,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ cpu_timer_fire(timer); } - - ret = 0; out: rcu_read_unlock(); if (old) From 5c8f23e6b73c13d9f7b52614783dcb9169883296 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:12 +0200 Subject: [PATCH 072/315] posix-cpu-timers: Consolidate timer base accessor Remove the ad-hoc timer base accessors and provide a consolidated one. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-6-frederic@kernel.org --- kernel/time/posix-cpu-timers.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4fbbbc89ac4a..0d918117a3e0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -407,6 +407,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) return 0; } +static struct posix_cputimer_base *timer_base(struct k_itimer *timer, + struct task_struct *tsk) +{ + int clkidx = CPUCLOCK_WHICH(timer->it_clock); + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + return tsk->posix_cputimers.bases + clkidx; + else + return tsk->signal->posix_cputimers.bases + clkidx; +} + /* * Dequeue the timer and reset the base if it was its earliest expiration. * It makes sure the next tick recalculates the base next expiration so we @@ -421,18 +432,11 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) { struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; - int clkidx; if (!cpu_timer_dequeue(ctmr)) return; - clkidx = CPUCLOCK_WHICH(timer->it_clock); - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; - + base = timer_base(timer, p); if (cpu_timer_getexpires(ctmr) == base->nextevt) base->nextevt = 0; } @@ -531,15 +535,9 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) */ static void arm_timer(struct k_itimer *timer, struct task_struct *p) { - int clkidx = CPUCLOCK_WHICH(timer->it_clock); + struct posix_cputimer_base *base = timer_base(timer, p); struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); - struct posix_cputimer_base *base; - - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - base = p->posix_cputimers.bases + clkidx; - else - base = p->signal->posix_cputimers.bases + clkidx; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; From ee375328f579f94251eb66d5dc91aba056019a31 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 26 Jul 2021 14:55:13 +0200 Subject: [PATCH 073/315] posix-cpu-timers: Recalc next expiration when timer_settime() ends up not queueing There are several scenarios that can result in posix_cpu_timer_set() not queueing the timer but still leaving the threadgroup cputime counter running or keeping the tick dependency around for a random amount of time. 1) If timer_settime() is called with a 0 expiration on a timer that is already disabled, the process wide cputime counter will be started and won't ever get a chance to be stopped by stop_process_timer() since no timer is actually armed to be processed. The following snippet is enough to trigger the issue. void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, TIMER_ABSTIME, &val, NULL); timer_delete(id); } 2) If timer_settime() is called with a 0 expiration on a timer that is already armed, the timer is dequeued but not really disarmed. So the process wide cputime counter and the tick dependency may still remain a while around. The following code snippet keeps this overhead around for one week after the timer deletion: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; val.it_value.tv_sec = 604800; timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); timer_settime(id, 0, &val, NULL); timer_delete(id); } 3) If the timer was initially deactivated, this call to timer_settime() with an early expiration may have started the process wide cputime counter even though the timer hasn't been queued and armed because it has fired early and inline within posix_cpu_timer_set() itself. As a result the process wide cputime counter may never stop until a new timer is ever armed in the future. The following code snippet can reproduce this: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; signal(SIGALRM, SIG_IGN); timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); val.it_value.tv_nsec = 1; timer_settime(id, TIMER_ABSTIME, &val, NULL); } 4) If the timer was initially armed with a former expiration value before this call to timer_settime() and the current call sets an early deadline that has already expired, the timer fires inline within posix_cpu_timer_set(). In this case it must have been dequeued before firing inline with its new expiration value, yet it hasn't been disarmed in this case. So the process wide cputime counter and the tick dependency may still be around for a while even after the timer fired. The following code snippet can reproduce this: void trigger_process_counter(void) { timer_t id; struct itimerspec val = { }; signal(SIGALRM, SIG_IGN); timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id); val.it_value.tv_sec = 100; timer_settime(id, TIMER_ABSTIME, &val, NULL); val.it_value.tv_sec = 0; val.it_value.tv_nsec = 1; timer_settime(id, TIMER_ABSTIME, &val, NULL); } Fix all these issues with triggering the related base next expiration recalculation on the next tick. This also implies to re-evaluate the need to keep around the process wide cputime counter and the tick dependency, in a similar fashion to disarm_timer(). Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210726125513.271824-7-frederic@kernel.org --- include/linux/posix-timers.h | 7 +++++- kernel/time/posix-cpu-timers.c | 41 +++++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 4cf1fbe8d1bc..00fef0064355 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -82,9 +82,14 @@ static inline bool cpu_timer_enqueue(struct timerqueue_head *head, return timerqueue_add(head, &ctmr->node); } +static inline bool cpu_timer_queued(struct cpu_timer *ctmr) +{ + return !!ctmr->head; +} + static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr) { - if (ctmr->head) { + if (cpu_timer_queued(ctmr)) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; return true; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0d918117a3e0..ee736861b18f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -418,6 +418,20 @@ static struct posix_cputimer_base *timer_base(struct k_itimer *timer, return tsk->signal->posix_cputimers.bases + clkidx; } +/* + * Force recalculating the base earliest expiration on the next tick. + * This will also re-evaluate the need to keep around the process wide + * cputime counter and tick dependency and eventually shut these down + * if necessary. + */ +static void trigger_base_recalc_expires(struct k_itimer *timer, + struct task_struct *tsk) +{ + struct posix_cputimer_base *base = timer_base(timer, tsk); + + base->nextevt = 0; +} + /* * Dequeue the timer and reset the base if it was its earliest expiration. * It makes sure the next tick recalculates the base next expiration so we @@ -438,7 +452,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) base = timer_base(timer, p); if (cpu_timer_getexpires(ctmr) == base->nextevt) - base->nextevt = 0; + trigger_base_recalc_expires(timer, p); } @@ -734,13 +748,28 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, timer->it_overrun_last = 0; timer->it_overrun = -1; - if (new_expires != 0 && !(val < new_expires)) { + if (val >= new_expires) { + if (new_expires != 0) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. + * Make sure we don't keep around the process wide cputime + * counter or the tick dependency if they are not necessary. */ - cpu_timer_fire(timer); + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + + if (!cpu_timer_queued(ctmr)) + trigger_base_recalc_expires(timer, p); + + unlock_task_sighand(p, &flags); } out: rcu_read_unlock(); From 627ef5ae2df8eeccb20d5af0e4cfa4df9e61ed28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:46 +0200 Subject: [PATCH 074/315] hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() If __hrtimer_start_range_ns() is invoked with an already armed hrtimer then the timer has to be canceled first and then added back. If the timer is the first expiring timer then on removal the clockevent device is reprogrammed to the next expiring timer to avoid that the pending expiry fires needlessly. If the new expiry time ends up to be the first expiry again then the clock event device has to reprogrammed again. Avoid this by checking whether the timer is the first to expire and in that case, keep the timer on the current CPU and delay the reprogramming up to the point where the timer has been enqueued again. Reported-by: Lorenzo Colitti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135157.873137732@linutronix.de --- kernel/time/hrtimer.c | 60 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4a66725b1d4a..ba2e0d0a0e5a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1030,12 +1030,13 @@ static void __remove_hrtimer(struct hrtimer *timer, * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { - int reprogram; + bool reprogram; /* * Remove the timer and force reprogramming when high @@ -1048,8 +1049,16 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * If the timer is not restarted then reprogramming is + * required if the timer is local. If it is local and about + * to be restarted, avoid programming it twice (on removal + * and a moment later when it's requeued). + */ if (!restart) state = HRTIMER_STATE_INACTIVE; + else + reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; @@ -1103,9 +1112,31 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; + bool force_local, first; - /* Remove an active timer from the queue: */ - remove_hrtimer(timer, base, true); + /* + * If the timer is on the local cpu base and is the first expiring + * timer then this might end up reprogramming the hardware twice + * (on removal and on enqueue). To avoid that by prevent the + * reprogram on removal, keep the timer local to the current CPU + * and enforce reprogramming after it is queued no matter whether + * it is the new first expiring timer again or not. + */ + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local &= base->cpu_base->next_timer == timer; + + /* + * Remove an active timer from the queue. In case it is not queued + * on the current CPU, make sure that remove_hrtimer() updates the + * remote data correctly. + * + * If it's on the current CPU and the first expiring timer, then + * skip reprogramming, keep the timer local and enforce + * reprogramming later if it was the first expiring timer. This + * avoids programming the underlying clock event twice (once at + * removal and once after enqueue). + */ + remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); @@ -1115,9 +1146,24 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ - new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + if (!force_local) { + new_base = switch_hrtimer_base(timer, base, + mode & HRTIMER_MODE_PINNED); + } else { + new_base = base; + } - return enqueue_hrtimer(timer, new_base, mode); + first = enqueue_hrtimer(timer, new_base, mode); + if (!force_local) + return first; + + /* + * Timer was forced to stay on the current CPU to avoid + * reprogramming on removal and enqueue. Force reprogram the + * hardware by evaluating the new first expiring timer. + */ + hrtimer_force_reprogram(new_base->cpu_base, 1); + return 0; } /** @@ -1183,7 +1229,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false); + ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); From b14bca97c9f5c3e3f133445b01c723e95490d843 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Jul 2021 15:39:47 +0200 Subject: [PATCH 075/315] hrtimer: Consolidate reprogramming code This code is mostly duplicated. The redudant store in the force reprogram case does no harm and the in hrtimer interrupt condition cannot be true for the force reprogram invocations. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.054424875@linutronix.de --- kernel/time/hrtimer.c | 72 +++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ba2e0d0a0e5a..5f7c46598d9f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,21 +652,24 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal, + struct hrtimer *next_timer, ktime_t expires_next) { - ktime_t expires_next; + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; - expires_next = hrtimer_update_next_event(cpu_base); + if (expires_next > cpu_base->expires_next) + return; if (skip_equal && expires_next == cpu_base->expires_next) return; + cpu_base->next_timer = next_timer; cpu_base->expires_next = expires_next; /* @@ -689,7 +692,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(expires_next, 1); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next; + + expires_next = hrtimer_update_next_event(cpu_base); + + __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer, + expires_next); } /* High resolution timer related functions */ @@ -835,40 +854,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; - /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. - */ - if (cpu_base->in_hrtirq) - return; - - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; - cpu_base->expires_next = expires; - - /* - * If hres is not active, hardware does not have to be - * programmed yet. - * - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) - return; - - /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. - */ - tick_program_event(expires, 1); + __hrtimer_reprogram(cpu_base, true, timer, expires); } /* From 8c3b5e6ec0fee18bc2ce38d1dfe913413205f908 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:48 +0200 Subject: [PATCH 076/315] hrtimer: Ensure timerfd notification for HIGHRES=n If high resolution timers are disabled the timerfd notification about a clock was set event is not happening for all cases which use clock_was_set_delayed() because that's a NOP for HIGHRES=n, which is wrong. Make clock_was_set_delayed() unconditially available to fix that. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.196661266@linutronix.de --- include/linux/hrtimer.h | 5 ----- kernel/time/hrtimer.c | 32 ++++++++++++++++---------------- kernel/time/tick-internal.h | 3 +++ 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bb5e7b0a4274..77295af72426 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -318,16 +318,12 @@ struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern void clock_was_set_delayed(void); - extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void clock_was_set_delayed(void) { } - #endif static inline ktime_t @@ -351,7 +347,6 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) timer->base->get_time()); } -extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5f7c46598d9f..7ebf642b53f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -777,22 +777,6 @@ static void hrtimer_switch_to_hres(void) retrigger_next_event(NULL); } -static void clock_was_set_work(struct work_struct *work) -{ - clock_was_set(); -} - -static DECLARE_WORK(hrtimer_work, clock_was_set_work); - -/* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus. - */ -void clock_was_set_delayed(void) -{ - schedule_work(&hrtimer_work); -} - #else static inline int hrtimer_is_hres_enabled(void) { return 0; } @@ -877,6 +861,22 @@ void clock_was_set(void) timerfd_clock_was_set(); } +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogram the hrtimer + * interrupt device on all cpus and to notify timerfd. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + /* * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6a742a29e545..cd610faa2523 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -165,3 +165,6 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); + +void clock_was_set(void); +void clock_was_set_delayed(void); From e71a4153b7c256ec103e79875398553808aeffd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:49 +0200 Subject: [PATCH 077/315] hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case When CONFIG_HIGH_RES_TIMERS is disabled, but NOHZ is enabled then clock_was_set() is not doing anything. With HIGHRES=n the kernel relies on the periodic tick to update the clock offsets, but when NOHZ is enabled and active then CPUs which are in a deep idle sleep do not have a periodic tick which means the expiry of timers affected by clock_was_set() can be arbitrarily delayed up to the point where the CPUs are brought out of idle again. Make the clock_was_set() logic unconditionaly available so that idle CPUs are kicked out of idle to handle the update. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.288697903@linutronix.de --- kernel/time/hrtimer.c | 87 +++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 7ebf642b53f9..214fd65a9597 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -739,23 +739,7 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - - if (!__hrtimer_hres_active(base)) - return; - - raw_spin_lock(&base->lock); - hrtimer_update_base(base); - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} +static void retrigger_next_event(void *arg); /* * Switch to high resolution mode @@ -781,9 +765,50 @@ static void hrtimer_switch_to_hres(void) static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Retrigger next event is called after clock was set with interrupts + * disabled through an SMP function call or directly from low level + * resume code. + * + * This is only invoked when: + * - CONFIG_HIGH_RES_TIMERS is enabled. + * - CONFIG_NOHZ_COMMON is enabled + * + * For the other cases this function is empty and because the call sites + * are optimized out it vanishes as well, i.e. no need for lots of + * #ifdeffery. + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + /* + * When high resolution mode or nohz is active, then the offsets of + * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the + * next tick will take care of that. + * + * If high resolution mode is active then the next expiring timer + * must be reevaluated and the clock event device reprogrammed if + * necessary. + * + * In the NOHZ case the update of the offset and the reevaluation + * of the next expiring timer is enough. The return from the SMP + * function call will take care of the reprogramming in case the + * CPU was in a NOHZ idle sleep. + */ + if (!__hrtimer_hres_active(base) && !tick_nohz_active) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + if (__hrtimer_hres_active(base)) + hrtimer_force_reprogram(base, 0); + else + hrtimer_update_next_event(base); + raw_spin_unlock(&base->lock); +} /* * When a timer is enqueued and expires earlier than the already enqueued @@ -842,22 +867,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) } /* - * Clock realtime was set + * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and + * CLOCK_BOOTTIME (for late sleep time injection). * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. + * This requires to update the offsets for these clocks + * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this + * also requires to eventually reprogram the per CPU clock event devices + * when the change moves an affected timer ahead of the first expiring + * timer on that CPU. Obviously remote per CPU clock event devices cannot + * be reprogrammed. The other reason why an IPI has to be sent is when the + * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets + * in the tick, which obviously might be stopped, so this has to bring out + * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(void) { -#ifdef CONFIG_HIGH_RES_TIMERS + if (!hrtimer_hres_active() && !tick_nohz_active) + goto out_timerfd; + /* Retrigger the CPU local events everywhere */ on_each_cpu(retrigger_next_event, NULL, 1); -#endif + +out_timerfd: timerfd_clock_was_set(); } From 66f7b0c8aadd2785fc29f2c71477ebc16f4e38cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:50 +0200 Subject: [PATCH 078/315] timerfd: Provide timerfd_resume() Resuming timekeeping is a clock-was-set event and uses the clock-was-set notification mechanism. This is in the way of making the clock-was-set update for hrtimers selective so unnecessary IPIs are avoided when a CPU base does not have timers queued which are affected by the clock setting. Provide a seperate timerfd_resume() interface so the resume logic and the clock-was-set mechanism can be distangled in the core code. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.395287410@linutronix.de --- fs/timerfd.c | 16 ++++++++++++++++ include/linux/hrtimer.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/fs/timerfd.c b/fs/timerfd.c index c5509d2448e3..e9c96a0c79f1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -115,6 +115,22 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } +static void timerfd_resume_work(struct work_struct *work) +{ + timerfd_clock_was_set(); +} + +static DECLARE_WORK(timerfd_work, timerfd_resume_work); + +/* + * Invoked from timekeeping_resume(). Defer the actual update to work so + * timerfd_clock_was_set() runs in task context. + */ +void timerfd_resume(void) +{ + schedule_work(&timerfd_work); +} + static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 77295af72426..253c6e25f331 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -349,8 +349,10 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); +extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } +static inline void timerfd_resume(void) { } #endif extern void hrtimers_resume(void); From a761a67f591a8c7476c30bb20ed0f09fdfb1a704 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:51 +0200 Subject: [PATCH 079/315] timekeeping: Distangle resume and clock-was-set events Resuming timekeeping is a clock-was-set event and uses the clock-was-set notification mechanism. This is in the way of making the clock-was-set update for hrtimers selective so unnecessary IPIs are avoided when a CPU base does not have timers queued which are affected by the clock setting. Distangle it by invoking hrtimer_resume() on each unfreezing CPU and invoke the new timerfd_resume() function from timekeeping_resume() which is the only place where this is needed. Rename hrtimer_resume() to hrtimer_resume_local() to reflect the change. With this the clock_was_set*() functions are not longer required to IPI all CPUs unconditionally and can get some smarts to avoid them. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.488853478@linutronix.de --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 15 ++++++--------- kernel/time/tick-common.c | 7 +++++++ kernel/time/tick-internal.h | 2 ++ kernel/time/timekeeping.c | 4 +++- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 253c6e25f331..0ee140176f10 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -354,7 +354,6 @@ extern void timerfd_resume(void); static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif -extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 214fd65a9597..68e56f0ecb09 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -900,8 +900,8 @@ static void clock_was_set_work(struct work_struct *work) static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* - * Called from timekeeping and resume code to reprogram the hrtimer - * interrupt device on all cpus and to notify timerfd. + * Called from timekeeping code to reprogram the hrtimer interrupt device + * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { @@ -909,18 +909,15 @@ void clock_was_set_delayed(void) } /* - * During resume we might have to reprogram the high resolution timer - * interrupt on all online CPUs. However, all other CPUs will be - * stopped with IRQs interrupts disabled so the clock_was_set() call - * must be deferred. + * Called during resume either directly from via timekeeping_resume() + * or in the case of s2idle from tick_unfreeze() to ensure that the + * hrtimers are up to date. */ -void hrtimers_resume(void) +void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); - /* And schedule a retrigger for all others */ - clock_was_set_delayed(); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d663249652ef..46789356f856 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -470,6 +470,13 @@ void tick_resume_local(void) else tick_resume_oneshot(); } + + /* + * Ensure that hrtimers are up to date and the clockevents device + * is reprogrammed correctly when high resolution timers are + * enabled. + */ + hrtimers_resume_local(); } /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cd610faa2523..22de98cc6dd8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -168,3 +168,5 @@ void timer_clear_idle(void); void clock_was_set(void); void clock_was_set_delayed(void); + +void hrtimers_resume_local(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a364aa9881a..c8a9b9e54c9d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1810,8 +1810,10 @@ void timekeeping_resume(void) touch_softlockup_watchdog(); + /* Resume the clockevent device(s) and hrtimers */ tick_resume(); - hrtimers_resume(); + /* Notify timerfd as resume is equivalent to clock_was_set() */ + timerfd_resume(); } int timekeeping_suspend(void) From 1b267793f4fd9a089ea8558f3b6698186b9a3214 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:52 +0200 Subject: [PATCH 080/315] time/timekeeping: Avoid invoking clock_was_set() twice do_adjtimex() might end up scheduling a delayed clock_was_set() via timekeeping_advance() and then invoke clock_was_set() directly which is pointless. Make timekeeping_advance() return whether an invocation of clock_was_set() is required and handle it at the call sites which allows do_adjtimex() to issue a single direct call if required. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.580966888@linutronix.de --- kernel/time/timekeeping.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c8a9b9e54c9d..19ed58e97b57 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2127,7 +2127,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static void timekeeping_advance(enum timekeeping_adv_mode mode) +static bool timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; @@ -2198,9 +2198,8 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode) write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (clock_set) - /* Have to call _delayed version, since in irq context*/ - clock_was_set_delayed(); + + return !!clock_set; } /** @@ -2209,7 +2208,8 @@ out: */ void update_wall_time(void) { - timekeeping_advance(TK_ADV_TICK); + if (timekeeping_advance(TK_ADV_TICK)) + clock_was_set_delayed(); } /** @@ -2389,8 +2389,9 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; - unsigned long flags; + bool clock_set = false; struct timespec64 ts; + unsigned long flags; s32 orig_tai, tai; int ret; @@ -2425,6 +2426,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + clock_set = true; } tk_update_leap_state(tk); @@ -2435,9 +2437,9 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - timekeeping_advance(TK_ADV_FREQ); + clock_set |= timekeeping_advance(TK_ADV_FREQ); - if (tai != orig_tai) + if (clock_set) clock_was_set(); ntp_notify_cmos_timer(); From 17a1b8826b451c80e7999a7c68e06b70579b2b8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:53 +0200 Subject: [PATCH 081/315] hrtimer: Add bases argument to clock_was_set() clock_was_set() unconditionaly invokes retrigger_next_event() on all online CPUs. This was necessary because that mechanism was also used for resume from suspend to idle which is not longer the case. The bases arguments allows the callers of clock_was_set() to hand in a mask which tells clock_was_set() which of the hrtimer clock bases are affected by the clock setting. This mask will be used in the next step to check whether a CPU base has timers queued on a clock base affected by the event and avoid the SMP function call if there are none. Add a @bases argument, provide defines for the active bases masking and fixup all callsites. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.691083465@linutronix.de --- kernel/time/hrtimer.c | 4 ++-- kernel/time/tick-internal.h | 9 ++++++++- kernel/time/timekeeping.c | 14 +++++++------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 68e56f0ecb09..c8af165c04eb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -880,7 +880,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ -void clock_was_set(void) +void clock_was_set(unsigned int bases) { if (!hrtimer_hres_active() && !tick_nohz_active) goto out_timerfd; @@ -894,7 +894,7 @@ out_timerfd: static void clock_was_set_work(struct work_struct *work) { - clock_was_set(); + clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 22de98cc6dd8..3548f0829e6d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -166,7 +166,14 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); void timer_clear_idle(void); -void clock_was_set(void); +#define CLOCK_SET_WALL \ + (BIT(HRTIMER_BASE_REALTIME) | BIT(HRTIMER_BASE_REALTIME_SOFT) | \ + BIT(HRTIMER_BASE_TAI) | BIT(HRTIMER_BASE_TAI_SOFT)) + +#define CLOCK_SET_BOOT \ + (BIT(HRTIMER_BASE_BOOTTIME) | BIT(HRTIMER_BASE_BOOTTIME_SOFT)) + +void clock_was_set(unsigned int bases); void clock_was_set_delayed(void); void hrtimers_resume_local(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 19ed58e97b57..b348749a9fc6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,8 +1323,8 @@ out: write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); if (!ret) audit_tk_injoffset(ts_delta); @@ -1371,8 +1371,8 @@ error: /* even if we error out, we forwarded the time, so call update */ write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); return ret; } @@ -1746,8 +1746,8 @@ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - /* signal hrtimers about time change */ - clock_was_set(); + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT); } #endif @@ -2440,7 +2440,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(); + clock_was_set(CLOCK_REALTIME); ntp_notify_cmos_timer(); From 81d741d3460ca422843ce0ec8351083f259c6166 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 13 Jul 2021 15:39:54 +0200 Subject: [PATCH 082/315] hrtimer: Avoid unnecessary SMP function calls in clock_was_set() Setting of clocks triggers an unconditional SMP function call on all online CPUs to reprogram the clock event device. However, only some clocks have their offsets updated and therefore potentially require a reprogram. That's CLOCK_REALTIME and CLOCK_TAI and in the case of resume (delayed sleep time injection) also CLOCK_BOOTTIME. Instead of sending an IPI unconditionally, check each per CPU hrtimer base whether it has active timers in the affected clock bases which are indicated by the caller in the @bases argument of clock_was_set(). If that's not the case, skip the IPI and update the offsets remotely which ensures that any subsequently armed timers on the affected clocks are evaluated with the correct offsets. [ tglx: Adopted to the new bases argument, removed the softirq_active check, added comment, fixed up stale comment ] Signed-off-by: Marcelo Tosatti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.787536542@linutronix.de --- kernel/time/hrtimer.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index c8af165c04eb..5d44c90d41ea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -882,11 +882,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) */ void clock_was_set(unsigned int bases) { + cpumask_var_t mask; + int cpu; + if (!hrtimer_hres_active() && !tick_nohz_active) goto out_timerfd; - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + on_each_cpu(retrigger_next_event, NULL, 1); + goto out_timerfd; + } + + /* Avoid interrupting CPUs if possible */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + /* + * Only send the IPI when there are timers queued in one of + * the affected clock bases. Otherwise update the base + * remote to ensure that the next enqueue of a timer on + * such a clock base will see the correct offsets. + */ + if (cpu_base->active_bases & bases) + cpumask_set_cpu(cpu, mask); + else + hrtimer_update_base(cpu_base); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + } + + preempt_disable(); + smp_call_function_many(mask, retrigger_next_event, NULL, 1); + preempt_enable(); + cpus_read_unlock(); + free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); From 1e7f7fbcd40c69d23e3fe641ead9f3dc128fa8aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jul 2021 15:39:55 +0200 Subject: [PATCH 083/315] hrtimer: Avoid more SMP function calls in clock_was_set() By unconditionally updating the offsets there are more indicators whether the SMP function calls on clock_was_set() can be avoided: - When the offset update already happened on the remote CPU then the remote update attempt will yield the same seqeuence number and no IPI is required. - When the remote CPU is currently handling hrtimer_interrupt(). In that case the remote CPU will reevaluate the timer bases before reprogramming anyway, so nothing to do. - After updating it can be checked whether the first expiring timer in the affected clock bases moves before the first expiring (softirq) timer of the CPU. If that's not the case then sending the IPI is not required. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210713135158.887322464@linutronix.de --- kernel/time/hrtimer.c | 74 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5d44c90d41ea..88aefc35f7d2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -866,6 +866,68 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) __hrtimer_reprogram(cpu_base, true, timer, expires); } +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, + unsigned int active) +{ + struct hrtimer_clock_base *base; + unsigned int seq; + ktime_t expires; + + /* + * Update the base offsets unconditionally so the following + * checks whether the SMP function call is required works. + * + * The update is safe even when the remote CPU is in the hrtimer + * interrupt or the hrtimer soft interrupt and expiring affected + * bases. Either it will see the update before handling a base or + * it will see it when it finishes the processing and reevaluates + * the next expiring timer. + */ + seq = cpu_base->clock_was_set_seq; + hrtimer_update_base(cpu_base); + + /* + * If the sequence did not change over the update then the + * remote CPU already handled it. + */ + if (seq == cpu_base->clock_was_set_seq) + return false; + + /* + * If the remote CPU is currently handling an hrtimer interrupt, it + * will reevaluate the first expiring timer of all clock bases + * before reprogramming. Nothing to do here. + */ + if (cpu_base->in_hrtirq) + return false; + + /* + * Walk the affected clock bases and check whether the first expiring + * timer in a clock base is moving ahead of the first expiring timer of + * @cpu_base. If so, the IPI must be invoked because per CPU clock + * event devices cannot be remotely reprogrammed. + */ + active &= cpu_base->active_bases; + + for_each_active_base(base, cpu_base, active) { + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + expires = ktime_sub(next->expires, base->offset); + if (expires < cpu_base->expires_next) + return true; + + /* Extra check for softirq clock bases */ + if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + continue; + if (cpu_base->softirq_activated) + continue; + if (expires < cpu_base->softirq_expires_next) + return true; + } + return false; +} + /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). @@ -900,16 +962,10 @@ void clock_was_set(unsigned int bases) unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); - /* - * Only send the IPI when there are timers queued in one of - * the affected clock bases. Otherwise update the base - * remote to ensure that the next enqueue of a timer on - * such a clock base will see the correct offsets. - */ - if (cpu_base->active_bases & bases) + + if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); - else - hrtimer_update_base(cpu_base); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } From fb6a0408eac284688d5262519cbb3be0250e4caf Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:27:49 +0200 Subject: [PATCH 084/315] x86: Add support for 0x22/0x23 port I/O configuration space Define macros and accessors for the configuration space addressed indirectly with an index register and a data register at the port I/O locations of 0x22 and 0x23 respectively. This space is defined by the Intel MultiProcessor Specification for the IMCR register used to switch between the PIC and the APIC mode[1], by Cyrix processors for their configuration[2][3], and also some chipsets. Given the lack of atomicity with the indirect addressing a spinlock is required to protect accesses, although for Cyrix processors it is enough if accesses are executed with interrupts locally disabled, because the registers are local to the accessing CPU, and IMCR is only ever poked at by the BSP and early enough for interrupts not to have been configured yet. Therefore existing code does not have to change or use the new spinlock and neither it does. Put the spinlock in a library file then, so that it does not get pulled unnecessarily for configurations that do not refer it. Convert Cyrix accessors to wrappers so as to retain the brevity and clarity of the `getCx86' and `setCx86' calls. References: [1] "MultiProcessor Specification", Version 1.4, Intel Corporation, Order Number: 242016-006, May 1997, Section 3.6.2.1 "PIC Mode", pp. 3-7, 3-8 [2] "5x86 Microprocessor", Cyrix Corporation, Order Number: 94192-00, July 1995, Section 2.3.2.4 "Configuration Registers", p. 2-23 [3] "6x86 Processor", Cyrix Corporation, Order Number: 94175-01, March 1996, Section 2.4.4 "6x86 Configuration Registers", p. 2-23 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107182353140.9461@angie.orcam.me.uk --- arch/x86/include/asm/pc-conf-reg.h | 33 ++++++++++++++++++++++++++ arch/x86/include/asm/processor-cyrix.h | 8 +++---- arch/x86/kernel/apic/apic.c | 9 +++---- arch/x86/lib/Makefile | 1 + arch/x86/lib/pc-conf-reg.c | 13 ++++++++++ 5 files changed, 54 insertions(+), 10 deletions(-) create mode 100644 arch/x86/include/asm/pc-conf-reg.h create mode 100644 arch/x86/lib/pc-conf-reg.c diff --git a/arch/x86/include/asm/pc-conf-reg.h b/arch/x86/include/asm/pc-conf-reg.h new file mode 100644 index 000000000000..56bceceacf5f --- /dev/null +++ b/arch/x86/include/asm/pc-conf-reg.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Support for the configuration register space at port I/O locations + * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec, + * Cyrix CPUs, numerous chipsets. + */ +#ifndef _ASM_X86_PC_CONF_REG_H +#define _ASM_X86_PC_CONF_REG_H + +#include +#include +#include + +#define PC_CONF_INDEX 0x22 +#define PC_CONF_DATA 0x23 + +#define PC_CONF_MPS_IMCR 0x70 + +extern raw_spinlock_t pc_conf_lock; + +static inline u8 pc_conf_get(u8 reg) +{ + outb(reg, PC_CONF_INDEX); + return inb(PC_CONF_DATA); +} + +static inline void pc_conf_set(u8 reg, u8 data) +{ + outb(reg, PC_CONF_INDEX); + outb(data, PC_CONF_DATA); +} + +#endif /* _ASM_X86_PC_CONF_REG_H */ diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h index df700a6cc869..efe3e46e454b 100644 --- a/arch/x86/include/asm/processor-cyrix.h +++ b/arch/x86/include/asm/processor-cyrix.h @@ -5,14 +5,14 @@ * Access order is always 0x22 (=offset), 0x23 (=value) */ +#include + static inline u8 getCx86(u8 reg) { - outb(reg, 0x22); - return inb(0x23); + return pc_conf_get(reg); } static inline void setCx86(u8 reg, u8 data) { - outb(reg, 0x22); - outb(data, 0x23); + pc_conf_set(reg, data); } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d262811ce14b..b70344bf6600 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -132,18 +133,14 @@ static int enabled_via_apicbase __ro_after_init; */ static inline void imcr_pic_to_apic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go through APIC */ - outb(0x01, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x01); } static inline void imcr_apic_to_pic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go directly to BSP */ - outb(0x00, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x00); } #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index bad4dee4f0e4..c6506c6a7092 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += pc-conf-reg.o lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o diff --git a/arch/x86/lib/pc-conf-reg.c b/arch/x86/lib/pc-conf-reg.c new file mode 100644 index 000000000000..febb52749e8d --- /dev/null +++ b/arch/x86/lib/pc-conf-reg.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Support for the configuration register space at port I/O locations + * 0x22 and 0x23 variously used by PC architectures, e.g. the MP Spec, + * Cyrix CPUs, numerous chipsets. As the space is indirectly addressed + * it may have to be protected with a spinlock, depending on the context. + */ + +#include + +#include + +DEFINE_RAW_SPINLOCK(pc_conf_lock); From 1ce849c755342b236fc6236dfe39dbbf536b64b6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:27:54 +0200 Subject: [PATCH 085/315] x86/PCI: Add support for the ALi M1487 (IBC) PIRQ router The ALi M1487 ISA Bus Controller (IBC), a part of the ALi FinALi 486 chipset, implements PCI interrupt steering with a PIRQ router[1] in the form of four 4-bit mappings, spread across two PCI INTx Routing Table Mapping Registers, available in the port I/O space accessible indirectly via the index/data register pair at 0x22/0x23, located at indices 0x42 and 0x43 for the INT1/INT2 and INT3/INT4 lines respectively. Additionally there is a separate PCI INTx Sensitivity Register at index 0x44 in the same port I/O space, whose bits 3:0 select the trigger mode for INT[4:1] lines respectively[2]. Manufacturer's documentation says that this register has to be set consistently with the relevant ELCR register[3]. Add a router-specific hook then and use it to handle this register. Accesses to the port I/O space concerned here need to be unlocked by writing the value of 0xc5 to the Lock Register at index 0x03 beforehand[4]. Do so then and then lock access after use for safety. The IBC is implemented as a peer bridge on the host bus rather than a southbridge on PCI and therefore it does not itself appear in the PCI configuration space. It is complemented by the M1489 Cache-Memory PCI Controller (CMP) host-to-PCI bridge, so use that device's identification for determining the presence of the IBC. References: [1] "M1489/M1487: 486 PCI Chip Set", Version 1.2, Acer Laboratories Inc., July 1997, Section 4: "Configuration Registers", pp. 76-77 [2] same, p. 77 [3] same, Section 5: "M1489/M1487 Software Programming Guide", pp. 99-100 [4] same, Section 4: "Configuration Registers", p. 37 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107191702020.9461@angie.orcam.me.uk --- arch/x86/pci/irq.c | 154 +++++++++++++++++++++++++++++++++++++++- include/linux/pci_ids.h | 1 + 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index d3a73f9335e1..1bccbc419630 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -13,9 +13,12 @@ #include #include #include +#include #include #include #include + +#include #include #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) @@ -47,6 +50,8 @@ struct irq_router { int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); + int (*lvl)(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq); }; struct irq_router_handler { @@ -169,6 +174,139 @@ void elcr_set_level_irq(unsigned int irq) } } +/* + * PIRQ routing for the M1487 ISA Bus Controller (IBC) ASIC used + * with the ALi FinALi 486 chipset. The IBC is not decoded in the + * PCI configuration space, so we identify it by the accompanying + * M1489 Cache-Memory PCI Controller (CMP) ASIC. + * + * There are four 4-bit mappings provided, spread across two PCI + * INTx Routing Table Mapping Registers, available in the port I/O + * space accessible indirectly via the index/data register pair at + * 0x22/0x23, located at indices 0x42 and 0x43 for the INT1/INT2 + * and INT3/INT4 lines respectively. The INT1/INT3 and INT2/INT4 + * lines are mapped in the low and the high 4-bit nibble of the + * corresponding register as follows: + * + * 0000 : Disabled + * 0001 : IRQ9 + * 0010 : IRQ3 + * 0011 : IRQ10 + * 0100 : IRQ4 + * 0101 : IRQ5 + * 0110 : IRQ7 + * 0111 : IRQ6 + * 1000 : Reserved + * 1001 : IRQ11 + * 1010 : Reserved + * 1011 : IRQ12 + * 1100 : Reserved + * 1101 : IRQ14 + * 1110 : Reserved + * 1111 : IRQ15 + * + * In addition to the usual ELCR register pair there is a separate + * PCI INTx Sensitivity Register at index 0x44 in the same port I/O + * space, whose bits 3:0 select the trigger mode for INT[4:1] lines + * respectively. Any bit set to 1 causes interrupts coming on the + * corresponding line to be passed to ISA as edge-triggered and + * otherwise they are passed as level-triggered. Manufacturer's + * documentation says this register has to be set consistently with + * the relevant ELCR register. + * + * Accesses to the port I/O space concerned here need to be unlocked + * by writing the value of 0xc5 to the Lock Register at index 0x03 + * beforehand. Any other value written to said register prevents + * further accesses from reaching the register file, except for the + * Lock Register being written with 0xc5 again. + * + * References: + * + * "M1489/M1487: 486 PCI Chip Set", Version 1.2, Acer Laboratories + * Inc., July 1997 + */ + +#define PC_CONF_FINALI_LOCK 0x03u +#define PC_CONF_FINALI_PCI_INTX_RT1 0x42u +#define PC_CONF_FINALI_PCI_INTX_RT2 0x43u +#define PC_CONF_FINALI_PCI_INTX_SENS 0x44u + +#define PC_CONF_FINALI_LOCK_KEY 0xc5u + +static u8 read_pc_conf_nybble(u8 base, u8 index) +{ + u8 reg = base + (index >> 1); + u8 x; + + x = pc_conf_get(reg); + return index & 1 ? x >> 4 : x & 0xf; +} + +static void write_pc_conf_nybble(u8 base, u8 index, u8 val) +{ + u8 reg = base + (index >> 1); + u8 x; + + x = pc_conf_get(reg); + x = index & 1 ? (x & 0x0f) | (val << 4) : (x & 0xf0) | val; + pc_conf_set(reg, x); +} + +static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev, + int pirq) +{ + static const u8 irqmap[16] = { + 0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15 + }; + unsigned long flags; + u8 x; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1)]; + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return x; +} + +static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + static const u8 irqmap[16] = { + 0, 0, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 + }; + u8 val = irqmap[irq]; + unsigned long flags; + + if (!val) + return 0; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1, val); + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + +static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) +{ + u8 mask = ~(1u << (pirq - 1)); + unsigned long flags; + u8 trig; + + elcr_set_level_irq(irq); + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY); + trig = pc_conf_get(PC_CONF_FINALI_PCI_INTX_SENS); + trig &= mask; + pc_conf_set(PC_CONF_FINALI_PCI_INTX_SENS, trig); + pc_conf_set(PC_CONF_FINALI_LOCK, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + /* * Common IRQ routing practice: nibbles in config space, * offset by some magic constant. @@ -745,6 +883,12 @@ static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { switch (device) { + case PCI_DEVICE_ID_AL_M1489: + r->name = "FinALi"; + r->get = pirq_finali_get; + r->set = pirq_finali_set; + r->lvl = pirq_finali_lvl; + return 1; case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: r->name = "ALI"; @@ -968,11 +1112,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { msg = "found"; - elcr_set_level_irq(irq); + if (r->lvl) + r->lvl(pirq_router_dev, dev, pirq, irq); + else + elcr_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { if (r->set(pirq_router_dev, dev, pirq, newirq)) { - elcr_set_level_irq(newirq); + if (r->lvl) + r->lvl(pirq_router_dev, dev, pirq, newirq); + else + elcr_set_level_irq(newirq); msg = "assigned"; irq = newirq; } diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4bac1831de80..256fa4d7e86c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1121,6 +1121,7 @@ #define PCI_DEVICE_ID_3COM_3CR990SVR 0x990a #define PCI_VENDOR_ID_AL 0x10b9 +#define PCI_DEVICE_ID_AL_M1489 0x1489 #define PCI_DEVICE_ID_AL_M1533 0x1533 #define PCI_DEVICE_ID_AL_M1535 0x1535 #define PCI_DEVICE_ID_AL_M1541 0x1541 From 6b79164f603d14a3ff9c64330c1ca6c05f0b019e Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:27:59 +0200 Subject: [PATCH 086/315] x86/PCI: Add support for the Intel 82374EB/82374SB (ESC) PIRQ router The Intel 82374EB/82374SB EISA System Component (ESC) devices implement PCI interrupt steering with a PIRQ router[1] in the form of four PIRQ Route Control registers, available in the port I/O space accessible indirectly via the index/data register pair at 0x22/0x23, located at indices 0x60/0x61/0x62/0x63 for the PIRQ0/1/2/3# lines respectively. The semantics is the same as with the PIIX router, however it is not clear if BIOSes use register indices or line numbers as the cookie to identify PCI interrupts in their routing tables and therefore support either scheme. Accesses to the port I/O space concerned here need to be unlocked by writing the value of 0x0f to the ESC ID Register at index 0x02 beforehand[2]. Do so then and then lock access after use for safety. This locking could possibly interfere with accesses to the Intel MP spec IMCR register, implemented by the 82374SB variant of the ESC only as the PCI/APIC Control Register at index 0x70[3], for which leaving access to the configuration space concerned unlocked may have been a requirement for the BIOS to remain compliant with the MP spec. However we only poke at the IMCR register if the APIC mode is used, in which case the PIRQ router is not, so this arrangement is not going to interfere with IMCR access code. The ESC is implemented as a part of the combined southbridge also made of 82375EB/82375SB PCI-EISA Bridge (PCEB) and does itself appear in the PCI configuration space. Use the PCEB's device identification then for determining the presence of the ESC. References: [1] "82374EB/82374SB EISA System Component (ESC)", Intel Corporation, Order Number: 290476-004, March 1996, Section 3.1.12 "PIRQ[0:3]#--PIRQ Route Control Registers", pp. 44-45 [2] same, Section 3.1.1 "ESCID--ESC ID Register", p. 36 [3] same, Section 3.1.17 "PAC--PCI/APIC Control Register", p. 47 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107192023450.9461@angie.orcam.me.uk --- arch/x86/pci/irq.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 1bccbc419630..187e284f2021 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -357,6 +357,74 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i return 0; } +/* + * PIRQ routing for the 82374EB/82374SB EISA System Component (ESC) + * ASIC used with the Intel 82420 and 82430 PCIsets. The ESC is not + * decoded in the PCI configuration space, so we identify it by the + * accompanying 82375EB/82375SB PCI-EISA Bridge (PCEB) ASIC. + * + * There are four PIRQ Route Control registers, available in the + * port I/O space accessible indirectly via the index/data register + * pair at 0x22/0x23, located at indices 0x60/0x61/0x62/0x63 for the + * PIRQ0/1/2/3# lines respectively. The semantics is the same as + * with the PIIX router. + * + * Accesses to the port I/O space concerned here need to be unlocked + * by writing the value of 0x0f to the ESC ID Register at index 0x02 + * beforehand. Any other value written to said register prevents + * further accesses from reaching the register file, except for the + * ESC ID Register being written with 0x0f again. + * + * References: + * + * "82374EB/82374SB EISA System Component (ESC)", Intel Corporation, + * Order Number: 290476-004, March 1996 + * + * "82375EB/82375SB PCI-EISA Bridge (PCEB)", Intel Corporation, Order + * Number: 290477-004, March 1996 + */ + +#define PC_CONF_I82374_ESC_ID 0x02u +#define PC_CONF_I82374_PIRQ_ROUTE_CONTROL 0x60u + +#define PC_CONF_I82374_ESC_ID_KEY 0x0fu + +static int pirq_esc_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + unsigned long flags; + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY); + x = pc_conf_get(reg); + pc_conf_set(PC_CONF_I82374_ESC_ID, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return (x < 16) ? x : 0; +} + +static int pirq_esc_set(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq) +{ + unsigned long flags; + int reg; + + reg = pirq; + if (reg >= 1 && reg <= 4) + reg += PC_CONF_I82374_PIRQ_ROUTE_CONTROL - 1; + + raw_spin_lock_irqsave(&pc_conf_lock, flags); + pc_conf_set(PC_CONF_I82374_ESC_ID, PC_CONF_I82374_ESC_ID_KEY); + pc_conf_set(reg, irq); + pc_conf_set(PC_CONF_I82374_ESC_ID, 0); + raw_spin_unlock_irqrestore(&pc_conf_lock, flags); + return 1; +} + /* * The Intel PIIX4 pirq rules are fairly simple: "pirq" is * just a pointer to the config space. @@ -687,6 +755,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 0; switch (device) { + case PCI_DEVICE_ID_INTEL_82375: + r->name = "PCEB/ESC"; + r->get = pirq_esc_get; + r->set = pirq_esc_set; + return 1; case PCI_DEVICE_ID_INTEL_82371FB_0: case PCI_DEVICE_ID_INTEL_82371SB_0: case PCI_DEVICE_ID_INTEL_82371AB_0: From 0e8c6f56fab3af3ef9f78f486e198792d3af0fa1 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:28:04 +0200 Subject: [PATCH 087/315] x86/PCI: Add support for the Intel 82426EX PIRQ router The Intel 82426EX ISA Bridge (IB), a part of the Intel 82420EX PCIset, implements PCI interrupt steering with a PIRQ router in the form of two PIRQ Route Control registers, available in the PCI configuration space at locations 0x66 and 0x67 for the PIRQ0# and PIRQ1# lines respectively. The semantics is the same as with the PIIX router, however it is not clear if BIOSes use register indices or line numbers as the cookie to identify PCI interrupts in their routing tables and therefore support either scheme. The IB is directly attached to the Intel 82425EX PCI System Controller (PSC) component of the chipset via a dedicated PSC/IB Link interface rather than the host bus or PCI. Therefore it does not itself appear in the PCI configuration space even though it responds to configuration cycles addressing registers it implements. Use 82425EX's identification then for determining the presence of the IB. References: [1] "82420EX PCIset Data Sheet, 82425EX PCI System Controller (PSC) and 82426EX ISA Bridge (IB)", Intel Corporation, Order Number: 290488-004, December 1995, Section 3.3.18 "PIRQ1RC/PIRQ0RC--PIRQ Route Control Registers", p. 61 Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107200213490.9461@angie.orcam.me.uk --- arch/x86/pci/irq.c | 49 +++++++++++++++++++++++++++++++++++++++++ include/linux/pci_ids.h | 1 + 2 files changed, 50 insertions(+) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 187e284f2021..b937c96f9f85 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -443,6 +443,50 @@ static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, return 1; } +/* + * PIRQ routing for the 82426EX ISA Bridge (IB) ASIC used with the + * Intel 82420EX PCIset. + * + * There are only two PIRQ Route Control registers, available in the + * combined 82425EX/82426EX PCI configuration space, at 0x66 and 0x67 + * for the PIRQ0# and PIRQ1# lines respectively. The semantics is + * the same as with the PIIX router. + * + * References: + * + * "82420EX PCIset Data Sheet, 82425EX PCI System Controller (PSC) + * and 82426EX ISA Bridge (IB)", Intel Corporation, Order Number: + * 290488-004, December 1995 + */ + +#define PCI_I82426EX_PIRQ_ROUTE_CONTROL 0x66u + +static int pirq_ib_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + int reg; + u8 x; + + reg = pirq; + if (reg >= 1 && reg <= 2) + reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1; + + pci_read_config_byte(router, reg, &x); + return (x < 16) ? x : 0; +} + +static int pirq_ib_set(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq) +{ + int reg; + + reg = pirq; + if (reg >= 1 && reg <= 2) + reg += PCI_I82426EX_PIRQ_ROUTE_CONTROL - 1; + + pci_write_config_byte(router, reg, irq); + return 1; +} + /* * The VIA pirq rules are nibble-based, like ALI, * but without the ugly irq number munging. @@ -805,6 +849,11 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; + case PCI_DEVICE_ID_INTEL_82425: + r->name = "PSC/IB"; + r->get = pirq_ib_get; + r->set = pirq_ib_set; + return 1; } if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 256fa4d7e86c..60e2101a009d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2644,6 +2644,7 @@ #define PCI_DEVICE_ID_INTEL_82375 0x0482 #define PCI_DEVICE_ID_INTEL_82424 0x0483 #define PCI_DEVICE_ID_INTEL_82378 0x0484 +#define PCI_DEVICE_ID_INTEL_82425 0x0486 #define PCI_DEVICE_ID_INTEL_MRST_SD0 0x0807 #define PCI_DEVICE_ID_INTEL_MRST_SD1 0x0808 #define PCI_DEVICE_ID_INTEL_MFD_SD 0x0820 From d25316616842b593de6f89ce2101f1af62f4d559 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:28:09 +0200 Subject: [PATCH 088/315] x86: Avoid magic number with ELCR register accesses Define PIC_ELCR1 and PIC_ELCR2 macros for accesses to the ELCR registers implemented by many chipsets in their embedded 8259A PIC cores, avoiding magic numbers that are difficult to handle, and complementing the macros we already have for registers originally defined with discrete 8259A PIC implementations. No functional change. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107200237300.9461@angie.orcam.me.uk --- arch/x86/include/asm/i8259.h | 2 ++ arch/x86/kernel/acpi/boot.c | 6 +++--- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/vector.c | 2 +- arch/x86/kernel/i8259.c | 8 ++++---- arch/x86/kernel/mpparse.c | 3 ++- arch/x86/pci/irq.c | 3 ++- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 89789e8c80f6..637fa1df3512 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -19,6 +19,8 @@ extern unsigned int cached_irq_mask; #define PIC_MASTER_OCW3 PIC_MASTER_ISR #define PIC_SLAVE_CMD 0xa0 #define PIC_SLAVE_IMR 0xa1 +#define PIC_ELCR1 0x4d0 +#define PIC_ELCR2 0x4d1 /* i8259A PIC related value */ #define PIC_CASCADE_IR 2 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e55e0c1fad8c..7f59f831c47c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -570,7 +570,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) unsigned int old, new; /* Real old ELCR mask */ - old = inb(0x4d0) | (inb(0x4d1) << 8); + old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8); /* * If we use ACPI to set PCI IRQs, then we should clear ELCR @@ -596,8 +596,8 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) return; pr_warn("setting ELCR to %04x (from %04x)\n", new, old); - outb(new, 0x4d0); - outb(new >> 8, 0x4d1); + outb(new, PIC_ELCR1); + outb(new >> 8, PIC_ELCR2); } int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5c691a3208b..7846499c54ec 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -764,7 +764,7 @@ static bool irq_active_low(int idx) static bool EISA_ELCR(unsigned int irq) { if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); + unsigned int port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } apic_printk(APIC_VERBOSE, KERN_INFO diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fb67ed5e7e6a..c132daabe615 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1299,7 +1299,7 @@ static void __init print_PIC(void) pr_debug("... PIC ISR: %04x\n", v); - v = inb(0x4d1) << 8 | inb(0x4d0); + v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1); pr_debug("... PIC ELCR: %04x\n", v); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 282b4ee1339f..15aefa3f3e18 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -235,15 +235,15 @@ static char irq_trigger[2]; */ static void restore_ELCR(char *trigger) { - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); + outb(trigger[0], PIC_ELCR1); + outb(trigger[1], PIC_ELCR2); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; + trigger[0] = inb(PIC_ELCR1) & 0xF8; + trigger[1] = inb(PIC_ELCR2) & 0xDE; } static void i8259A_resume(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8f06449aab27..fed721f90116 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -251,7 +252,7 @@ static int __init ELCR_trigger(unsigned int irq) { unsigned int port; - port = 0x4d0 + (irq >> 3); + port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index b937c96f9f85..97b63e35e152 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -158,7 +159,7 @@ static void __init pirq_peer_trick(void) void elcr_set_level_irq(unsigned int irq) { unsigned char mask = 1 << (irq & 7); - unsigned int port = 0x4d0 + (irq >> 3); + unsigned int port = PIC_ELCR1 + (irq >> 3); unsigned char val; static u16 elcr_irq_mask; From 34739a2809e1e5d54d41d93cfc6b074e8d781ee2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 20 Jul 2021 05:28:15 +0200 Subject: [PATCH 089/315] x86: Fix typo s/ECLR/ELCR/ for the PIC register The proper spelling for the acronym referring to the Edge/Level Control Register is ELCR rather than ECLR. Adjust references accordingly. No functional change. Signed-off-by: Maciej W. Rozycki Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/alpine.DEB.2.21.2107200251080.9461@angie.orcam.me.uk --- arch/x86/kernel/acpi/boot.c | 6 +++--- arch/x86/kvm/i8259.c | 20 ++++++++++---------- arch/x86/kvm/irq.h | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7f59f831c47c..14bcd59bcdee 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -558,10 +558,10 @@ acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end * If a PIC-mode SCI is not recognized or gives spurious IRQ7's * it may require Edge Trigger -- use "acpi_sci=edge" * - * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. - * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) - * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0) */ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 629a09ca9860..0b80263d46d8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -541,17 +541,17 @@ static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, addr, len, val); } -static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_write(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } -static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_read(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } @@ -577,9 +577,9 @@ static const struct kvm_io_device_ops picdev_slave_ops = { .write = picdev_slave_write, }; -static const struct kvm_io_device_ops picdev_eclr_ops = { - .read = picdev_eclr_read, - .write = picdev_eclr_write, +static const struct kvm_io_device_ops picdev_elcr_ops = { + .read = picdev_elcr_read, + .write = picdev_elcr_write, }; int kvm_pic_init(struct kvm *kvm) @@ -602,7 +602,7 @@ int kvm_pic_init(struct kvm *kvm) */ kvm_iodevice_init(&s->dev_master, &picdev_master_ops); kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); - kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); + kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops); mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, &s->dev_master); @@ -613,7 +613,7 @@ int kvm_pic_init(struct kvm *kvm) if (ret < 0) goto fail_unreg_2; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr); if (ret < 0) goto fail_unreg_1; @@ -647,7 +647,7 @@ void kvm_pic_destroy(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr); mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9b64abf9b3f1..650642b18d15 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -55,7 +55,7 @@ struct kvm_pic { int output; /* intr from master PIC */ struct kvm_io_device dev_master; struct kvm_io_device dev_slave; - struct kvm_io_device dev_eclr; + struct kvm_io_device dev_elcr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; From 018eca456c4b4dca56aaf1ec27f309c74d0fe246 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 21 Jul 2021 10:53:15 +0800 Subject: [PATCH 090/315] block: move some macros to blkdev.h Move them (PAGE_SECTORS_SHIFT, PAGE_SECTORS and SECTOR_MASK) to the generic header file to remove redundancy. Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20210721025315.1729118-1-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/brd.c | 3 --- drivers/block/null_blk/main.c | 4 ---- drivers/md/bcache/util.h | 2 -- include/linux/blkdev.h | 4 ++++ include/linux/device-mapper.h | 1 - 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 95694113e38e..58ec167aa018 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -27,9 +27,6 @@ #include -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) - /* * Each block ramdisk device has a radix_tree brd_pages of pages that stores * the pages containing the block device's contents. A brd page's ->index is diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d734e9ee1546..f128242d1170 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -11,10 +11,6 @@ #include #include "null_blk.h" -#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) -#define SECTOR_MASK (PAGE_SECTORS - 1) - #define FREE_BATCH 16 #define TICKS_PER_SEC 50ULL diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index bca4a7c97da7..b64460a76267 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ #include "closure.h" -#define PAGE_SECTORS (PAGE_SIZE / 512) - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 07eef02325b4..df404c1fb087 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -939,6 +939,10 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 7457d49acf9a..94f2cd6a8e83 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -151,7 +151,6 @@ typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, size_t nr_pages); -#define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); From 162a5284faf41b2441b8f686f9ac4771c7a8f669 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 30 May 2021 12:24:45 -0400 Subject: [PATCH 091/315] x86/reboot: Document the "reboot=pci" option It is mentioned in the top level non-arch specific file but it was overlooked here for x86. Signed-off-by: Paul Gortmaker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210530162447.996461-2-paul.gortmaker@windriver.com --- Documentation/x86/x86_64/boot-options.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 5f62b3b86357..482f3b29f99a 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -126,7 +126,7 @@ Idle loop Rebooting ========= - reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old] + reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old] bios Use the CPU reboot vector for warm reset warm @@ -145,6 +145,8 @@ Rebooting Use efi reset_system runtime service. If EFI is not configured or the EFI reset does not work, the reboot path attempts the reset using the keyboard controller. + pci + Use a write to the PCI config space register 0xcf9 to trigger reboot. Using warm reset will be much faster especially on big memory systems because the BIOS will not go through the memory check. From 12febc181886f0658ce3413f554203c255d338dd Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 30 May 2021 12:24:46 -0400 Subject: [PATCH 092/315] x86/reboot: Document how to override DMI platform quirks commit 5955633e91bf ("x86/reboot: Skip DMI checks if reboot set by user") made it so that it's not required to recompile the kernel in order to bypass broken reboot quirks compiled into an image: * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't * been set on the command line). This is needed so that we can * suppress DMI scanning for reboot quirks. Without it, it's * impossible to override a faulty reboot quirk without recompiling. However, at the time it was not eally documented outside the source code, and so this information isn't really available to the average user out there. The change is a little white lie and invented "reboot=default" since it is easy to remember, and documents well. The truth is that any random string that is *not* a currently accepted string will work. Since that doesn't document well for non-coders, and since it's unknown what the future additions might be, lay claim on "default" since that is exactly what it achieves. Signed-off-by: Paul Gortmaker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210530162447.996461-3-paul.gortmaker@windriver.com --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/x86/x86_64/boot-options.rst | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdb22006f713..34c8dd54518e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4777,7 +4777,7 @@ reboot= [KNL] Format (x86 or x86_64): - [w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \ + [w[arm] | c[old] | h[ard] | s[oft] | g[pio]] | d[efault] \ [[,]s[mp]#### \ [[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \ [[,]f[orce] diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 482f3b29f99a..ccb7e86bf8d9 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -157,6 +157,13 @@ Rebooting Don't stop other CPUs on reboot. This can make reboot more reliable in some cases. + reboot=default + There are some built-in platform specific "quirks" - you may see: + "reboot: series board detected. Selecting for reboots." + In the case where you think the quirk is in error (e.g. you have + newer BIOS, or newer board) using this option will ignore the built-in + quirk table, and use the generic default reboot actions. + Non Executable Mappings ======================= From a729691b541f6e63043beae72e635635abe5dc09 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 30 May 2021 12:24:47 -0400 Subject: [PATCH 093/315] x86/reboot: Limit Dell Optiplex 990 quirk to early BIOS versions When this platform was relatively new in November 2011, with early BIOS revisions, a reboot quirk was added in commit 6be30bb7d750 ("x86/reboot: Blacklist Dell OptiPlex 990 known to require PCI reboot") However, this quirk (and several others) are open-ended to all BIOS versions and left no automatic expiry if/when the system BIOS fixed the issue, meaning that nobody is likely to come along and re-test. What is really problematic with using PCI reboot as this quirk does, is that it causes this platform to do a full power down, wait one second, and then power back on. This is less than ideal if one is using it for boot testing and/or bisecting kernels when legacy rotating hard disks are installed. It was only by chance that the quirk was noticed in dmesg - and when disabled it turned out that it wasn't required anymore (BIOS A24), and a default reboot would work fine without the "harshness" of power cycling the machine (and disks) down and up like the PCI reboot does. Doing a bit more research, it seems that the "newest" BIOS for which the issue was reported[1] was version A06, however Dell[2] seemed to suggest only up to and including version A05, with the A06 having a large number of fixes[3] listed. As is typical with a new platform, the initial BIOS updates come frequently and then taper off (and in this case, with a revival for CPU CVEs); a search for O990-A.exe reveals the following dates: A02 16 Mar 2011 A03 11 May 2011 A06 14 Sep 2011 A07 24 Oct 2011 A10 08 Dec 2011 A14 06 Sep 2012 A16 15 Oct 2012 A18 30 Sep 2013 A19 23 Sep 2015 A20 02 Jun 2017 A23 07 Mar 2018 A24 21 Aug 2018 While it's overkill to flash and test each of the above, it would seem likely that the issue was contained within A0x BIOS versions, given the dates above and the dates of issue reports[4] from distros. So rather than just throw out the quirk entirely, limit the scope to just those early BIOS versions, in case people are still running systems from 2011 with the original as-shipped early A0x BIOS versions. [1] https://lore.kernel.org/lkml/1320373471-3942-1-git-send-email-trenn@suse.de/ [2] https://www.dell.com/support/kbdoc/en-ca/000131908/linux-based-operating-systems-stall-upon-reboot-on-optiplex-390-790-990-systems [3] https://www.dell.com/support/home/en-ca/drivers/driversdetails?driverid=85j10 [4] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/768039 Fixes: 6be30bb7d750 ("x86/reboot: Blacklist Dell OptiPlex 990 known to require PCI reboot") Signed-off-by: Paul Gortmaker Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210530162447.996461-4-paul.gortmaker@windriver.com --- arch/x86/kernel/reboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index ebfb91108232..0a40df66a40d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -388,10 +388,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell OptiPlex 990 BIOS A0x", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), }, }, { /* Handle problems with rebooting on Dell 300's */ From 29e6a5e01d0adae52a2859ed39cb9e607430e011 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:21 +0200 Subject: [PATCH 094/315] mmc: block: let device_add_disk create disk attributes Pass the attribute group for the attributes on the gendisk to device_add_disk so that they are created atomically with the disk creation. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20210809064028.1198327-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 102 +++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index ce8aed562929..4ac3e1b93e7e 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -128,8 +128,6 @@ struct mmc_blk_data { * track of the current selected device partition. */ unsigned int part_curr; - struct device_attribute force_ro; - struct device_attribute power_ro_lock; int area_type; /* debugfs files (only in main mmc_blk_data) */ @@ -281,6 +279,9 @@ out_put: return count; } +static DEVICE_ATTR(ro_lock_until_next_power_on, 0, + power_ro_lock_show, power_ro_lock_store); + static ssize_t force_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -313,6 +314,44 @@ out: return ret; } +static DEVICE_ATTR(force_ro, 0644, force_ro_show, force_ro_store); + +static struct attribute *mmc_disk_attrs[] = { + &dev_attr_force_ro.attr, + &dev_attr_ro_lock_until_next_power_on.attr, + NULL, +}; + +static umode_t mmc_disk_attrs_is_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + umode_t mode = a->mode; + + if (a == &dev_attr_ro_lock_until_next_power_on.attr && + (md->area_type & MMC_BLK_DATA_AREA_BOOT) && + md->queue.card->ext_csd.boot_ro_lockable) { + mode = S_IRUGO; + if (!(md->queue.card->ext_csd.boot_ro_lock & + EXT_CSD_BOOT_WP_B_PWR_WP_DIS)) + mode |= S_IWUSR; + } + + mmc_blk_put(md); + return mode; +} + +static const struct attribute_group mmc_disk_attr_group = { + .is_visible = mmc_disk_attrs_is_visible, + .attrs = mmc_disk_attrs, +}; + +static const struct attribute_group *mmc_disk_attr_groups[] = { + &mmc_disk_attr_group, + NULL, +}; + static int mmc_blk_open(struct block_device *bdev, fmode_t mode) { struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); @@ -2644,15 +2683,8 @@ static void mmc_blk_remove_req(struct mmc_blk_data *md) * from being accepted. */ card = md->queue.card; - if (md->disk->flags & GENHD_FL_UP) { - device_remove_file(disk_to_dev(md->disk), &md->force_ro); - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) - device_remove_file(disk_to_dev(md->disk), - &md->power_ro_lock); - + if (md->disk->flags & GENHD_FL_UP) del_gendisk(md->disk); - } mmc_cleanup_queue(&md->queue); mmc_blk_put(md); } @@ -2679,51 +2711,6 @@ static void mmc_blk_remove_parts(struct mmc_card *card, } } -static int mmc_add_disk(struct mmc_blk_data *md) -{ - int ret; - struct mmc_card *card = md->queue.card; - - device_add_disk(md->parent, md->disk, NULL); - md->force_ro.show = force_ro_show; - md->force_ro.store = force_ro_store; - sysfs_attr_init(&md->force_ro.attr); - md->force_ro.attr.name = "force_ro"; - md->force_ro.attr.mode = S_IRUGO | S_IWUSR; - ret = device_create_file(disk_to_dev(md->disk), &md->force_ro); - if (ret) - goto force_ro_fail; - - if ((md->area_type & MMC_BLK_DATA_AREA_BOOT) && - card->ext_csd.boot_ro_lockable) { - umode_t mode; - - if (card->ext_csd.boot_ro_lock & EXT_CSD_BOOT_WP_B_PWR_WP_DIS) - mode = S_IRUGO; - else - mode = S_IRUGO | S_IWUSR; - - md->power_ro_lock.show = power_ro_lock_show; - md->power_ro_lock.store = power_ro_lock_store; - sysfs_attr_init(&md->power_ro_lock.attr); - md->power_ro_lock.attr.mode = mode; - md->power_ro_lock.attr.name = - "ro_lock_until_next_power_on"; - ret = device_create_file(disk_to_dev(md->disk), - &md->power_ro_lock); - if (ret) - goto power_ro_lock_fail; - } - return ret; - -power_ro_lock_fail: - device_remove_file(disk_to_dev(md->disk), &md->force_ro); -force_ro_fail: - del_gendisk(md->disk); - - return ret; -} - #ifdef CONFIG_DEBUG_FS static int mmc_dbg_card_status_get(void *data, u64 *val) @@ -2919,12 +2906,13 @@ static int mmc_blk_probe(struct mmc_card *card) dev_set_drvdata(&card->dev, md); - ret = mmc_add_disk(md); + device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); if (ret) goto out; list_for_each_entry(part_md, &md->part, part) { - ret = mmc_add_disk(part_md); + device_add_disk(part_md->parent, part_md->disk, + mmc_disk_attr_groups); if (ret) goto out; } From a94dcfce70d3f4f6cd99f3b43d74305e3a4f3983 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:22 +0200 Subject: [PATCH 095/315] mmc: block: cleanup gendisk creation Restructure mmc_blk_probe to avoid a failure path with a half created disk. Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Link: https://lore.kernel.org/r/20210809064028.1198327-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 49 ++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 4ac3e1b93e7e..4c11f171e56d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2328,7 +2328,8 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, sector_t size, bool default_ro, const char *subname, - int area_type) + int area_type, + unsigned int part_type) { struct mmc_blk_data *md; int devidx, ret; @@ -2375,6 +2376,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, kref_init(&md->kref); md->queue.blkdata = md; + md->part_type = part_type; md->disk->major = MMC_BLOCK_MAJOR; md->disk->minors = perdev_minors; @@ -2427,6 +2429,10 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), cap_str, md->read_only ? "(ro)" : ""); + /* used in ->open, must be set before add_disk: */ + if (area_type == MMC_BLK_DATA_AREA_MAIN) + dev_set_drvdata(&card->dev, md); + device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); return md; err_kfree: @@ -2456,7 +2462,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card) } return mmc_blk_alloc_req(card, &card->dev, size, false, NULL, - MMC_BLK_DATA_AREA_MAIN); + MMC_BLK_DATA_AREA_MAIN, 0); } static int mmc_blk_alloc_part(struct mmc_card *card, @@ -2470,10 +2476,9 @@ static int mmc_blk_alloc_part(struct mmc_card *card, struct mmc_blk_data *part_md; part_md = mmc_blk_alloc_req(card, disk_to_dev(md->disk), size, default_ro, - subname, area_type); + subname, area_type, part_type); if (IS_ERR(part_md)) return PTR_ERR(part_md); - part_md->part_type = part_type; list_add(&part_md->part, &md->part); return 0; @@ -2674,20 +2679,13 @@ static int mmc_blk_alloc_parts(struct mmc_card *card, struct mmc_blk_data *md) static void mmc_blk_remove_req(struct mmc_blk_data *md) { - struct mmc_card *card; - - if (md) { - /* - * Flush remaining requests and free queues. It - * is freeing the queue that stops new requests - * from being accepted. - */ - card = md->queue.card; - if (md->disk->flags & GENHD_FL_UP) - del_gendisk(md->disk); - mmc_cleanup_queue(&md->queue); - mmc_blk_put(md); - } + /* + * Flush remaining requests and free queues. It is freeing the queue + * that stops new requests from being accepted. + */ + del_gendisk(md->disk); + mmc_cleanup_queue(&md->queue); + mmc_blk_put(md); } static void mmc_blk_remove_parts(struct mmc_card *card, @@ -2876,7 +2874,7 @@ static void mmc_blk_remove_debugfs(struct mmc_card *card, static int mmc_blk_probe(struct mmc_card *card) { - struct mmc_blk_data *md, *part_md; + struct mmc_blk_data *md; int ret = 0; /* @@ -2904,19 +2902,6 @@ static int mmc_blk_probe(struct mmc_card *card) if (ret) goto out; - dev_set_drvdata(&card->dev, md); - - device_add_disk(md->parent, md->disk, mmc_disk_attr_groups); - if (ret) - goto out; - - list_for_each_entry(part_md, &md->part, part) { - device_add_disk(part_md->parent, part_md->disk, - mmc_disk_attr_groups); - if (ret) - goto out; - } - /* Add two debugfs entries */ mmc_blk_add_debugfs(card, md); From 5eba200526ac5fee7659c45b6c23fb2c576f8813 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:23 +0200 Subject: [PATCH 096/315] nvme: remove the GENHD_FL_UP check in nvme_ns_remove Early probe failure never reaches nvme_ns_remove, so GENHD_FL_UP must be set at this point. Remove the check. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6c0a59c4b53..dbe7144f0026 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3826,14 +3826,12 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_mpath_clear_current_path(ns); synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ - if (ns->disk->flags & GENHD_FL_UP) { - if (!nvme_ns_head_multipath(ns->head)) - nvme_cdev_del(&ns->cdev, &ns->cdev_device); - del_gendisk(ns->disk); - blk_cleanup_queue(ns->queue); - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - } + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); + del_gendisk(ns->disk); + blk_cleanup_queue(ns->queue); + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); down_write(&ns->ctrl->namespaces_rwsem); list_del_init(&ns->list); From 916a470da02f909cabb65337f65438b8bc3965b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:24 +0200 Subject: [PATCH 097/315] nvme: replace the GENHD_FL_UP check in nvme_mpath_shutdown_disk Use the nvme-internal NVME_NSHEAD_DISK_LIVE flag instead of abusing the block layer state. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/multipath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3f32c5e86bfc..37ce3e8b1db2 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -765,7 +765,7 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) if (!head->disk) return; kblockd_schedule_work(&head->requeue_work); - if (head->disk->flags & GENHD_FL_UP) { + if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); } From 4f9e14aecfbdc6b762d5122489604858c5fec5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:25 +0200 Subject: [PATCH 098/315] sx8: use the internal state machine to check if del_gendisk needs to be called Remove usage of the block layer internal GENHD_FL_UP by just looking at the host state. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 7b54353ee92b..420cd952ddc4 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1373,7 +1373,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (!disk) return; - if (disk->flags & GENHD_FL_UP) + if (host->state > HST_DEV_ACTIVATE) del_gendisk(disk); blk_cleanup_disk(disk); } From 224b0683228c5f332f9cee615d85e75e9a347170 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:26 +0200 Subject: [PATCH 099/315] bcache: add proper error unwinding in bcache_device_init Except for the IDA none of the allocations in bcache_device_init is unwound on error, fix that. Signed-off-by: Christoph Hellwig Acked-by: Coly Li Link: https://lore.kernel.org/r/20210809064028.1198327-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 185246a0d855..d0f08e946453 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -931,20 +931,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -987,8 +987,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } From b75f4aed88febe903bd40a6128b74edd2388417e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:27 +0200 Subject: [PATCH 100/315] bcache: move the del_gendisk call out of bcache_device_free Let the callers call del_gendisk so that we can check if add_disk has been called properly for the cached device case instead of relying on the block layer internal GENHD_FL_UP flag. Signed-off-by: Christoph Hellwig Reviewed-by: Coly Li Link: https://lore.kernel.org/r/20210809064028.1198327-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d0f08e946453..f2874c77ff79 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -885,11 +885,6 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - bool disk_added = (disk->flags & GENHD_FL_UP) != 0; - - if (disk_added) - del_gendisk(disk); - blk_cleanup_disk(disk); ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); @@ -1371,8 +1366,10 @@ static void cached_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); - if (atomic_read(&dc->running)) + if (atomic_read(&dc->running)) { bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + del_gendisk(dc->disk.disk); + } bcache_device_free(&dc->disk); list_del(&dc->list); @@ -1518,6 +1515,7 @@ static void flash_dev_free(struct closure *cl) mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), &d->c->flash_dev_dirty_sectors); + del_gendisk(d->disk); bcache_device_free(d); mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); From 50b4aecfbbb09869db967e4a26212a47e10c0088 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Aug 2021 08:40:28 +0200 Subject: [PATCH 101/315] block: remove GENHD_FL_UP Just check inode_unhashed on the whole device bdev inode instead, and provide a helper to check for that information. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210809064028.1198327-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 6 ++---- block/partitions/core.c | 4 ++-- drivers/md/md.h | 4 +--- drivers/nvme/host/core.c | 2 +- fs/block_dev.c | 2 +- include/linux/genhd.h | 9 +++++---- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f8def1129501..9d6b3aeea288 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -77,7 +77,8 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) * initial capacity during probing. */ if (size == capacity || - (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + !disk_live(disk) || + (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", @@ -527,8 +528,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_EXT_DEVT; } - disk->flags |= GENHD_FL_UP; - disk_alloc_events(disk); if (disk->flags & GENHD_FL_HIDDEN) { @@ -597,7 +596,6 @@ void del_gendisk(struct gendisk *disk) mutex_lock(&disk->open_mutex); remove_inode_hash(disk->part0->bd_inode); - disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); mutex_unlock(&disk->open_mutex); diff --git a/block/partitions/core.c b/block/partitions/core.c index fb3a556cacce..c6738ccbcee5 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -459,7 +459,7 @@ int bdev_add_partition(struct block_device *bdev, int partno, int ret; mutex_lock(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) { + if (!disk_live(disk)) { ret = -ENXIO; goto out; } @@ -669,7 +669,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) lockdep_assert_held(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) return -ENXIO; rescan: diff --git a/drivers/md/md.h b/drivers/md/md.h index 832547cf038f..4c96c36bd01a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -764,9 +764,7 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) { - int flags = rdev->bdev->bd_disk->flags; - - if (!(flags & GENHD_FL_UP)) { + if (!disk_live(rdev->bdev->bd_disk)) { if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) pr_warn("md: %s: %s array has a missing/failed member\n", mdname(rdev->mddev), md_type); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dbe7144f0026..1478d825011d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1822,7 +1822,7 @@ static void nvme_update_disk_info(struct gendisk *disk, static inline bool nvme_first_scan(struct gendisk *disk) { /* nvme_alloc_ns() scans the disk prior to adding it */ - return !(disk->flags & GENHD_FL_UP); + return !disk_live(disk); } static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) diff --git a/fs/block_dev.c b/fs/block_dev.c index e1c14c2e0504..38a8b0e04a0c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1218,7 +1218,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b3bab578f03a..b47e297cd551 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -60,9 +60,6 @@ struct partition_meta_info { * device. * Affects responses to the ``CDROM_GET_CAPABILITY`` ioctl. * - * ``GENHD_FL_UP`` (0x0010): indicates that the block device is "up", - * with a similar meaning to network interfaces. - * * ``GENHD_FL_SUPPRESS_PARTITION_INFO`` (0x0020): don't include * partition information in ``/proc/partitions`` or in the output of * printk_all_partitions(). @@ -97,7 +94,6 @@ struct partition_meta_info { /* 2 is unused (used to be GENHD_FL_DRIVERFS) */ /* 4 is unused (used to be GENHD_FL_MEDIA_CHANGE_NOTIFY) */ #define GENHD_FL_CD 0x0008 -#define GENHD_FL_UP 0x0010 #define GENHD_FL_SUPPRESS_PARTITION_INFO 0x0020 #define GENHD_FL_EXT_DEVT 0x0040 #define GENHD_FL_NATIVE_CAPACITY 0x0080 @@ -178,6 +174,11 @@ struct gendisk { u64 diskseq; }; +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. From a08aa9bccdc282b5e8d133bf8c239473f057b464 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:09 +0200 Subject: [PATCH 102/315] block: store a gendisk in struct parsed_partitions Partition scanning only happens on the whole device, so pass a struct gendisk instead of the whole device block_device to the scanners. This allows to simplify printing the device name in various places as the disk name is available in disk->name. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20210810154512.1809898-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 4 ++-- block/partitions/aix.c | 20 ++------------------ block/partitions/amiga.c | 7 +++---- block/partitions/atari.c | 4 ++-- block/partitions/check.h | 2 +- block/partitions/cmdline.c | 6 ++---- block/partitions/core.c | 6 +++--- block/partitions/efi.c | 36 +++++++++++++++++------------------- block/partitions/ibm.c | 4 ++-- block/partitions/ldm.c | 18 +++++++++--------- block/partitions/mac.c | 2 +- block/partitions/msdos.c | 6 ++++-- block/partitions/sgi.c | 5 ++--- block/partitions/sun.c | 5 ++--- 14 files changed, 52 insertions(+), 73 deletions(-) diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958bf..2c381c694c57 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,7 +275,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. */ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a97..85f4b967565e 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -66,22 +66,6 @@ struct pvd { #define LVM_MAXLVS 256 -/** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - /** * read_lba(): Read bytes from disk, starting at given LBA * @state @@ -89,7 +73,7 @@ static u64 last_lba(struct block_device *bdev) * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9aed..5c8624e26a54 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -34,7 +34,6 @@ int amiga_partition(struct parsed_partitions *state) int start_sect, nr_sects, blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) @@ -42,7 +41,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ int amiga_partition(struct parsed_partitions *state) } pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -84,7 +83,7 @@ int amiga_partition(struct parsed_partitions *state) data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + state->disk->disk_name, blk); res = -1; goto rdb_done; } diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c8522..da5994175416 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, §); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67f0..d5b28e309d64 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -9,7 +9,7 @@ * description. */ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 482a29e95dbd..1af610f0ba8c 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -380,7 +380,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -397,12 +396,11 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); diff --git a/block/partitions/core.c b/block/partitions/core.c index c6738ccbcee5..5dd1cd1a163d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -135,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } state->pp_buf[0] = '\0'; - state->bdev = hd->part0; + state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -717,10 +717,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct page *page; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; return NULL; } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e2716792ecc1..aaa3dc487cb5 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(disk->part0->bd_inode->i_size, + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ done: * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. */ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,13 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -705,7 +703,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +720,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 4b044e620d35..9bca396aef4a 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -290,8 +290,8 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions *state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; loff_t i_size, offset, size; dasd_information2_t *info; diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index cc86534c80ad..a6f0c9eaebe9 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ out: * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -1340,7 +1340,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1432,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b6095335636c..7b521df00a39 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index f5102596a984..b5d5c229cc3b 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -135,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -579,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -587,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb0515..9cc6b8c1eea4 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, §); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 47dc53eccf77..ddf9e6def4b2 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } From 7f6be3765e113e0d4b8e6b65e1074982de94377e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:10 +0200 Subject: [PATCH 103/315] block: pass a gendisk to bdev_add_partition bdev_add_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 3 ++- block/partitions/core.c | 5 ++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk.h b/block/blk.h index 56f33fbcde59..c0486f609978 100644 --- a/block/blk.h +++ b/block/blk.h @@ -347,8 +347,8 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index fff161eaab42..36e0ec76b3b2 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,6 +16,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { + struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; long long start, length; @@ -40,7 +41,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, /* check if partition is aligned to blocksize */ if (p.start & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(bdev, p.pno, start, length); default: diff --git a/block/partitions/core.c b/block/partitions/core.c index 5dd1cd1a163d..7b227c114297 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -451,11 +451,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part; - struct gendisk *disk = bdev->bd_disk; int ret; mutex_lock(&disk->open_mutex); From 926fbb1677e0d963dd96dae3c0305e855590d524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:11 +0200 Subject: [PATCH 104/315] block: pass a gendisk to bdev_del_partition bdev_del_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/ioctl.c | 2 +- block/partitions/core.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk.h b/block/blk.h index c0486f609978..21c441eb6773 100644 --- a/block/blk.h +++ b/block/blk.h @@ -349,7 +349,7 @@ void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_WHOLEDISK 2 int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); +int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); diff --git a/block/ioctl.c b/block/ioctl.c index 36e0ec76b3b2..8f57b276b2f1 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -31,7 +31,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; if (op == BLKPG_DEL_PARTITION) - return bdev_del_partition(bdev, p.pno); + return bdev_del_partition(disk, p.pno); start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; diff --git a/block/partitions/core.c b/block/partitions/core.c index 7b227c114297..8c7abf0ee0ea 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -476,13 +476,13 @@ out: return ret; } -int bdev_del_partition(struct block_device *bdev, int partno) +int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -493,7 +493,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } From 3d2e79894bd7adc7d14638a0c72ceb8b722d1fa3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:45:12 +0200 Subject: [PATCH 105/315] block: pass a gendisk to bdev_resize_partition bdev_resize_partition can only operate on the whole device. Make that clear by passing a gendisk instead of a block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210810154512.1809898-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 ++-- block/ioctl.c | 2 +- block/partitions/core.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk.h b/block/blk.h index 21c441eb6773..db6f82bbb683 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,8 +350,8 @@ void blk_free_ext_minor(unsigned int minor); int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); int bdev_del_partition(struct gendisk *disk, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/ioctl.c b/block/ioctl.c index 8f57b276b2f1..eb0491e90b9a 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -43,7 +43,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, return -EINVAL; return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 8c7abf0ee0ea..9265936df77e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -497,14 +497,14 @@ out_unlock: return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; - mutex_lock(&bdev->bd_disk->open_mutex); - part = xa_load(&bdev->bd_disk->part_tbl, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; @@ -513,14 +513,14 @@ int bdev_resize_partition(struct block_device *bdev, int partno, goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&disk->open_mutex); return ret; } From 9482fd71dbb8f0d1a61821a83e467dc0a9d7b429 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Aug 2021 22:31:24 +0200 Subject: [PATCH 106/315] hrtimer: Use raw_cpu_ptr() in clock_was_set() clock_was_set() can be invoked from preemptible context. Use raw_cpu_ptr() to check whether high resolution mode is active or not. It does not matter whether the task migrates after acquiring the pointer. Fixes: e71a4153b7c2 ("hrtimer: Force clock_was_set() handling for the HIGHRES=n, NOHZ=y case") Reported-by: Mike Galbraith Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/875ywacsmb.ffs@tglx --- kernel/time/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 88aefc35f7d2..33b00e213e07 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -944,10 +944,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { + struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active() && !tick_nohz_active) + if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -958,9 +959,9 @@ void clock_was_set(unsigned int bases) /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; + cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) From f80e21489590c00f46226d5802d900e6f66e5633 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Aug 2021 22:32:30 +0200 Subject: [PATCH 107/315] hrtimer: Unbreak hrtimer_force_reprogram() Since the recent consoliation of reprogramming functions, hrtimer_force_reprogram() is affected by a check whether the new expiry time is past the current expiry time. This breaks the NOHZ logic as that relies on the fact that the tick hrtimer is moved into the future. That means cpu_base->expires_next becomes stale and subsequent reprogramming attempts fail as well until the situation is cleaned up by an hrtimer interrupts. For some yet unknown reason this leads to a complete stall, so for now partially revert the offending commit to a known working state. The root cause for the stall is still investigated and will be fixed in a subsequent commit. Fixes: b14bca97c9f5 ("hrtimer: Consolidate reprogramming code") Reported-by: Mike Galbraith Reported-by: Marek Szyprowski Signed-off-by: Thomas Gleixner Tested-by: Mike Galbraith Link: https://lore.kernel.org/r/8735recskh.ffs@tglx --- kernel/time/hrtimer.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 33b00e213e07..0ea8702eb516 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -652,24 +652,10 @@ static inline int hrtimer_hres_active(void) return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } -static void -__hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal, - struct hrtimer *next_timer, ktime_t expires_next) +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *next_timer, + ktime_t expires_next) { - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) - return; - - if (expires_next > cpu_base->expires_next) - return; - - if (skip_equal && expires_next == cpu_base->expires_next) - return; - - cpu_base->next_timer = next_timer; cpu_base->expires_next = expires_next; /* @@ -707,8 +693,10 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) expires_next = hrtimer_update_next_event(cpu_base); - __hrtimer_reprogram(cpu_base, skip_equal, cpu_base->next_timer, - expires_next); + if (skip_equal && expires_next == cpu_base->expires_next) + return; + + __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ @@ -863,7 +851,19 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (base->cpu_base != cpu_base) return; - __hrtimer_reprogram(cpu_base, true, timer, expires); + if (expires >= cpu_base->expires_next) + return; + + /* + * If the hrtimer interrupt is running, then it will reevaluate the + * clock bases and reprogram the clock event device. + */ + if (cpu_base->in_hrtirq) + return; + + cpu_base->next_timer = timer; + + __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, From ae460fd9164b16654d8ec06cbc280b832f840eac Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 16:43:40 +0100 Subject: [PATCH 108/315] clocksource/drivers/exynos_mct: Prioritise Arm arch timer on arm64 All arm64 CPUs feature an architected timer, which offers a relatively low-latency interface to a per-cpu clocksource and timer. For the most part, using this interface is a no-brainer, with the exception of SoCs where it cannot be used to wake up from deep idle state (i.e. CLOCK_EVT_FEAT_C3STOP is set). On the contrary, the Exynos MCT is extremely slow to access yet can be used as a wakeup source. In preparation for using the Exynos MCT as a potential wakeup timer for the Arm architected timer, reduce its ratings so that the architected timer is preferred. This effectively reverts the decision made in 6282edb72bed ("clocksource/drivers/exynos_mct: Increase priority over ARM arch timer") for arm64, as the reasoning for the original change was to work around a 32-bit SoC design. Cc: Marek Szyprowski Cc: Krzysztof Kozlowski Cc: Chanwoo Choi Cc: Thomas Gleixner Signed-off-by: Will Deacon Tested-by: Krzysztof Kozlowski # exynos-5422 Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210608154341.10794-2-will@kernel.org --- drivers/clocksource/exynos_mct.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index fabad79baafc..804d3e01c8f4 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -51,6 +51,15 @@ #define TICK_BASE_CNT 1 +#ifdef CONFIG_ARM +/* Use values higher than ARM arch timer. See 6282edb72bed. */ +#define MCT_CLKSOURCE_RATING 450 +#define MCT_CLKEVENTS_RATING 500 +#else +#define MCT_CLKSOURCE_RATING 350 +#define MCT_CLKEVENTS_RATING 350 +#endif + enum { MCT_INT_SPI, MCT_INT_PPI @@ -206,7 +215,7 @@ static void exynos4_frc_resume(struct clocksource *cs) static struct clocksource mct_frc = { .name = "mct-frc", - .rating = 450, /* use value higher than ARM arch timer */ + .rating = MCT_CLKSOURCE_RATING, .read = exynos4_frc_read, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -457,7 +466,7 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; - evt->rating = 500; /* use value higher than ARM arch timer */ + evt->rating = MCT_CLKEVENTS_RATING, exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET); From 88183788eacb782eb6e1295f1934fb9531b503d6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 8 Jun 2021 16:43:41 +0100 Subject: [PATCH 109/315] clocksource/drivers/exynos_mct: Mark MCT device as CLOCK_EVT_FEAT_PERCPU The "mct_tick" is a per-cpu clockevents device. Set the CLOCK_EVT_FEAT_PERCPU feature to prevent e.g. mct_tick0 being unsafely designated as the global broadcast timer and instead treat the device as a per-cpu wakeup timer. Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Krzysztof Kozlowski Signed-off-by: Will Deacon Acked-by: Krzysztof Kozlowski Reviewed-by: Chanwoo Choi Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210608154341.10794-3-will@kernel.org --- drivers/clocksource/exynos_mct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 804d3e01c8f4..5e3e96d3d1b9 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -465,7 +465,8 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) evt->set_state_oneshot = set_state_shutdown; evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; - evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; + evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERCPU; evt->rating = MCT_CLKEVENTS_RATING, exynos4_mct_write(TICK_BASE_CNT, mevt->base + MCT_L_TCNTB_OFFSET); From faa186adbd06f3e7113ae1dc6766e2273d5d9231 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Thu, 6 May 2021 08:11:36 -0300 Subject: [PATCH 110/315] dt-bindings: timer: convert rockchip,rk-timer.txt to YAML Convert Rockchip Timer dt-bindings to YAML. Signed-off-by: Ezequiel Garcia Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210506111136.3941-4-ezequiel@collabora.com --- .../bindings/timer/rockchip,rk-timer.txt | 27 -------- .../bindings/timer/rockchip,rk-timer.yaml | 64 +++++++++++++++++++ 2 files changed, 64 insertions(+), 27 deletions(-) delete mode 100644 Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt create mode 100644 Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt deleted file mode 100644 index d65fdce7c7f0..000000000000 --- a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt +++ /dev/null @@ -1,27 +0,0 @@ -Rockchip rk timer - -Required properties: -- compatible: should be: - "rockchip,rv1108-timer", "rockchip,rk3288-timer": for Rockchip RV1108 - "rockchip,rk3036-timer", "rockchip,rk3288-timer": for Rockchip RK3036 - "rockchip,rk3066-timer", "rockchip,rk3288-timer": for Rockchip RK3066 - "rockchip,rk3188-timer", "rockchip,rk3288-timer": for Rockchip RK3188 - "rockchip,rk3228-timer", "rockchip,rk3288-timer": for Rockchip RK3228 - "rockchip,rk3229-timer", "rockchip,rk3288-timer": for Rockchip RK3229 - "rockchip,rk3288-timer": for Rockchip RK3288 - "rockchip,rk3368-timer", "rockchip,rk3288-timer": for Rockchip RK3368 - "rockchip,rk3399-timer": for Rockchip RK3399 -- reg: base address of the timer register starting with TIMERS CONTROL register -- interrupts: should contain the interrupts for Timer0 -- clocks : must contain an entry for each entry in clock-names -- clock-names : must include the following entries: - "timer", "pclk" - -Example: - timer: timer@ff810000 { - compatible = "rockchip,rk3288-timer"; - reg = <0xff810000 0x20>; - interrupts = ; - clocks = <&xin24m>, <&cru PCLK_TIMER>; - clock-names = "timer", "pclk"; - }; diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml new file mode 100644 index 000000000000..e26ecb5893ae --- /dev/null +++ b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.yaml @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/rockchip,rk-timer.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip Timer Device Tree Bindings + +maintainers: + - Daniel Lezcano + +properties: + compatible: + oneOf: + - const: rockchip,rk3288-timer + - const: rockchip,rk3399-timer + - items: + - enum: + - rockchip,rv1108-timer + - rockchip,rk3036-timer + - rockchip,rk3066-timer + - rockchip,rk3188-timer + - rockchip,rk3228-timer + - rockchip,rk3229-timer + - rockchip,rk3288-timer + - rockchip,rk3368-timer + - rockchip,px30-timer + - const: rockchip,rk3288-timer + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + minItems: 2 + maxItems: 2 + + clock-names: + items: + - const: pclk + - const: timer + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +additionalProperties: false + +examples: + - | + #include + #include + + timer: timer@ff810000 { + compatible = "rockchip,rk3288-timer"; + reg = <0xff810000 0x20>; + interrupts = ; + clocks = <&cru PCLK_TIMER>, <&xin24m>; + clock-names = "pclk", "timer"; + }; From fad7cd3310db3099f95dd34312c77740fbc455e5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 4 Aug 2021 10:12:12 +0800 Subject: [PATCH 111/315] nbd: add the check to prevent overflow in __nbd_ioctl() If user specify a large enough value of NBD blocks option, it may trigger signed integer overflow which may lead to nbd->config->bytesize becomes a large or small value, zero in particular. UBSAN: Undefined behaviour in drivers/block/nbd.c:325:31 signed integer overflow: 1024 * 4611686155866341414 cannot be represented in type 'long long int' [...] Call trace: [...] handle_overflow+0x188/0x1dc lib/ubsan.c:192 __ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213 nbd_size_set drivers/block/nbd.c:325 [inline] __nbd_ioctl drivers/block/nbd.c:1342 [inline] nbd_ioctl+0x998/0xa10 drivers/block/nbd.c:1395 __blkdev_driver_ioctl block/ioctl.c:311 [inline] [...] Although it is not a big deal, still silence the UBSAN by limit the input value. Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210804021212.990223-1-libaokun1@huawei.com [axboe: dropped unlikely()] Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c38317979f74..f82264835794 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1384,6 +1384,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { struct nbd_config *config = nbd->config; + loff_t bytesize; switch (cmd) { case NBD_DISCONNECT: @@ -1398,8 +1399,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SIZE: return nbd_set_size(nbd, arg, config->blksize); case NBD_SET_SIZE_BLOCKS: - return nbd_set_size(nbd, arg * config->blksize, - config->blksize); + if (check_mul_overflow((loff_t)arg, config->blksize, &bytesize)) + return -EINVAL; + return nbd_set_size(nbd, bytesize, config->blksize); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; From be83c3b6e7b8ff22f72827a613bf6f3aa5afadbb Mon Sep 17 00:00:00 2001 From: Phong Hoang Date: Thu, 22 Apr 2021 14:34:43 +0200 Subject: [PATCH 112/315] clocksource/drivers/sh_cmt: Fix wrong setting if don't request IRQ for clock source channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CMT instance has at least two channels, one channel will be used as a clock source and another one used as a clock event device. In that case, IRQ is not requested for clock source channel so sh_cmt_clock_event_program_verify() might work incorrectly. Besides, when a channel is only used for clock source, don't need to re-set the next match_value since it should be maximum timeout as it still is. On the other hand, due to no IRQ, total_cycles is not counted up when reaches compare match time (timer counter resets to zero), so sh_cmt_clocksource_read() returns unexpected value. Therefore, use 64-bit clocksoure's mask for 32-bit or 16-bit variants will also lead to wrong delta calculation. Hence, this mask should correspond to timer counter width, and above function just returns the raw value of timer counter register. Fixes: bfa76bb12f23 ("clocksource: sh_cmt: Request IRQ for clock event device only") Fixes: 37e7742c55ba ("clocksource/drivers/sh_cmt: Fix clocksource width for 32-bit machines") Signed-off-by: Phong Hoang Signed-off-by: Niklas Söderlund Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422123443.73334-1-niklas.soderlund+renesas@ragnatech.se --- drivers/clocksource/sh_cmt.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index d7ed99f0001f..dd0956ad969c 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -579,7 +579,8 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag) ch->flags |= flag; /* setup timeout if no clockevent */ - if ((flag == FLAG_CLOCKSOURCE) && (!(ch->flags & FLAG_CLOCKEVENT))) + if (ch->cmt->num_channels == 1 && + flag == FLAG_CLOCKSOURCE && (!(ch->flags & FLAG_CLOCKEVENT))) __sh_cmt_set_next(ch, ch->max_match_value); out: raw_spin_unlock_irqrestore(&ch->lock, flags); @@ -621,20 +622,25 @@ static struct sh_cmt_channel *cs_to_sh_cmt(struct clocksource *cs) static u64 sh_cmt_clocksource_read(struct clocksource *cs) { struct sh_cmt_channel *ch = cs_to_sh_cmt(cs); - unsigned long flags; u32 has_wrapped; - u64 value; - u32 raw; - raw_spin_lock_irqsave(&ch->lock, flags); - value = ch->total_cycles; - raw = sh_cmt_get_counter(ch, &has_wrapped); + if (ch->cmt->num_channels == 1) { + unsigned long flags; + u64 value; + u32 raw; - if (unlikely(has_wrapped)) - raw += ch->match_value + 1; - raw_spin_unlock_irqrestore(&ch->lock, flags); + raw_spin_lock_irqsave(&ch->lock, flags); + value = ch->total_cycles; + raw = sh_cmt_get_counter(ch, &has_wrapped); - return value + raw; + if (unlikely(has_wrapped)) + raw += ch->match_value + 1; + raw_spin_unlock_irqrestore(&ch->lock, flags); + + return value + raw; + } + + return sh_cmt_get_counter(ch, &has_wrapped); } static int sh_cmt_clocksource_enable(struct clocksource *cs) @@ -697,7 +703,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch, cs->disable = sh_cmt_clocksource_disable; cs->suspend = sh_cmt_clocksource_suspend; cs->resume = sh_cmt_clocksource_resume; - cs->mask = CLOCKSOURCE_MASK(sizeof(u64) * 8); + cs->mask = CLOCKSOURCE_MASK(ch->cmt->info->width); cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n", From 68c9417b193d0d174b0ada013602272177e61303 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 11 Aug 2021 14:44:23 +0200 Subject: [PATCH 113/315] nbd: do del_gendisk() asynchronously for NBD_DESTROY_ON_DISCONNECT Now open_mutex is used to synchronize partition operations (e.g, blk_drop_partitions() and blkdev_reread_part()), however it makes nbd driver broken, because nbd may call del_gendisk() in nbd_release() or nbd_genl_disconnect() if NBD_CFLAG_DESTROY_ON_DISCONNECT is enabled, and deadlock occurs, as shown below: // AB-BA dead-lock nbd_genl_disconnect blkdev_open nbd_disconnect_and_put lock bd_mutex // last ref nbd_put lock nbd_index_mutex del_gendisk nbd_open try lock nbd_index_mutex try lock bd_mutex or // AA dead-lock nbd_release lock bd_mutex nbd_put try lock bd_mutex Instead of fixing block layer (e.g, introduce another lock), fixing the nbd driver to call del_gendisk() in a kworker when NBD_DESTROY_ON_DISCONNECT is enabled. When NBD_DESTROY_ON_DISCONNECT is disabled, nbd device will always be destroy through module removal, and there is no risky of deadlock. To ensure the reuse of nbd index succeeds, moving the calling of idr_remove() after del_gendisk(), so if the reused index is not found in nbd_index_idr, the old disk must have been deleted. And reusing the existing destroy_complete mechanism to ensure nbd_genl_connect() will wait for the completion of del_gendisk(). Also adding a new workqueue for nbd removal, so nbd_cleanup() can ensure all removals complete before exits. Reported-by: syzbot+0fe7752e52337864d29b@syzkaller.appspotmail.com Fixes: c76f48eb5c08 ("block: take bd_mutex around delete_partitions in del_gendisk") Signed-off-by: Hou Tao Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-2-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 72 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f82264835794..deefb2cda9bb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -49,6 +49,7 @@ static DEFINE_IDR(nbd_index_idr); static DEFINE_MUTEX(nbd_index_mutex); +static struct workqueue_struct *nbd_del_wq; static int nbd_total_devices = 0; struct nbd_sock { @@ -113,6 +114,7 @@ struct nbd_device { struct mutex config_lock; struct gendisk *disk; struct workqueue_struct *recv_workq; + struct work_struct remove_work; struct list_head list; struct task_struct *task_recv; @@ -233,7 +235,7 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -static void nbd_dev_remove(struct nbd_device *nbd) +static void nbd_del_disk(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; @@ -242,16 +244,53 @@ static void nbd_dev_remove(struct nbd_device *nbd) blk_cleanup_disk(disk); blk_mq_free_tag_set(&nbd->tag_set); } +} - /* - * Place this in the last just before the nbd is freed to - * make sure that the disk and the related kobject are also - * totally removed to avoid duplicate creation of the same - * one. - */ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) +/* + * Place this in the last just before the nbd is freed to + * make sure that the disk and the related kobject are also + * totally removed to avoid duplicate creation of the same + * one. + */ +static void nbd_notify_destroy_completion(struct nbd_device *nbd) +{ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + nbd->destroy_complete) complete(nbd->destroy_complete); +} +static void nbd_dev_remove_work(struct work_struct *work) +{ + struct nbd_device *nbd = + container_of(work, struct nbd_device, remove_work); + + nbd_del_disk(nbd); + + mutex_lock(&nbd_index_mutex); + /* + * Remove from idr after del_gendisk() completes, + * so if the same id is reused, the following + * add_disk() will succeed. + */ + idr_remove(&nbd_index_idr, nbd->index); + + nbd_notify_destroy_completion(nbd); + mutex_unlock(&nbd_index_mutex); + + kfree(nbd); +} + +static void nbd_dev_remove(struct nbd_device *nbd) +{ + /* Call del_gendisk() asynchrounously to prevent deadlock */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) { + queue_work(nbd_del_wq, &nbd->remove_work); + return; + } + + nbd_del_disk(nbd); + idr_remove(&nbd_index_idr, nbd->index); + nbd_notify_destroy_completion(nbd); kfree(nbd); } @@ -259,7 +298,6 @@ static void nbd_put(struct nbd_device *nbd) { if (refcount_dec_and_mutex_lock(&nbd->refs, &nbd_index_mutex)) { - idr_remove(&nbd_index_idr, nbd->index); nbd_dev_remove(nbd); mutex_unlock(&nbd_index_mutex); } @@ -1681,6 +1719,7 @@ static int nbd_dev_add(int index) nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; + INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->destroy_complete = NULL; nbd->backend = NULL; @@ -2418,7 +2457,14 @@ static int __init nbd_init(void) if (register_blkdev(NBD_MAJOR, "nbd")) return -EIO; + nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0); + if (!nbd_del_wq) { + unregister_blkdev(NBD_MAJOR, "nbd"); + return -ENOMEM; + } + if (genl_register_family(&nbd_genl_family)) { + destroy_workqueue(nbd_del_wq); unregister_blkdev(NBD_MAJOR, "nbd"); return -EINVAL; } @@ -2436,7 +2482,10 @@ static int nbd_exit_cb(int id, void *ptr, void *data) struct list_head *list = (struct list_head *)data; struct nbd_device *nbd = ptr; - list_add_tail(&nbd->list, list); + /* Skip nbd that is being removed asynchronously */ + if (refcount_read(&nbd->refs)) + list_add_tail(&nbd->list, list); + return 0; } @@ -2459,6 +2508,9 @@ static void __exit nbd_cleanup(void) nbd_put(nbd); } + /* Also wait for nbd_dev_remove_work() completes */ + destroy_workqueue(nbd_del_wq); + idr_destroy(&nbd_index_idr); genl_unregister_family(&nbd_genl_family); unregister_blkdev(NBD_MAJOR, "nbd"); From 3f74e0645c52a08f640380c9c46f9a3a172b9389 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:24 +0200 Subject: [PATCH 114/315] nbd: refactor device removal Share common code for the synchronous and workqueue based device removal, and remove the pointless use of refcount_dec_and_mutex_lock. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-3-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index deefb2cda9bb..a9883fbed924 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -259,48 +259,37 @@ static void nbd_notify_destroy_completion(struct nbd_device *nbd) complete(nbd->destroy_complete); } -static void nbd_dev_remove_work(struct work_struct *work) +static void nbd_dev_remove(struct nbd_device *nbd) { - struct nbd_device *nbd = - container_of(work, struct nbd_device, remove_work); - nbd_del_disk(nbd); - mutex_lock(&nbd_index_mutex); /* - * Remove from idr after del_gendisk() completes, - * so if the same id is reused, the following - * add_disk() will succeed. + * Remove from idr after del_gendisk() completes, so if the same ID is + * reused, the following add_disk() will succeed. */ + mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); mutex_unlock(&nbd_index_mutex); kfree(nbd); } -static void nbd_dev_remove(struct nbd_device *nbd) +static void nbd_dev_remove_work(struct work_struct *work) { - /* Call del_gendisk() asynchrounously to prevent deadlock */ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) { - queue_work(nbd_del_wq, &nbd->remove_work); - return; - } - - nbd_del_disk(nbd); - idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); - kfree(nbd); + nbd_dev_remove(container_of(work, struct nbd_device, remove_work)); } static void nbd_put(struct nbd_device *nbd) { - if (refcount_dec_and_mutex_lock(&nbd->refs, - &nbd_index_mutex)) { + if (!refcount_dec_and_test(&nbd->refs)) + return; + + /* Call del_gendisk() asynchrounously to prevent deadlock */ + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + queue_work(nbd_del_wq, &nbd->remove_work); + else nbd_dev_remove(nbd); - mutex_unlock(&nbd_index_mutex); - } } static int nbd_disconnected(struct nbd_config *config) From 327b501b1d94342fe17a1b6b1a40746e57ddd472 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:25 +0200 Subject: [PATCH 115/315] nbd: remove nbd_del_disk Fold nbd_del_disk and remove the pointless NULL check on ->disk given that it is always set for a successfully allocated nbd_device structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-4-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9883fbed924..48530fe01c0f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -235,17 +235,6 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -static void nbd_del_disk(struct nbd_device *nbd) -{ - struct gendisk *disk = nbd->disk; - - if (disk) { - del_gendisk(disk); - blk_cleanup_disk(disk); - blk_mq_free_tag_set(&nbd->tag_set); - } -} - /* * Place this in the last just before the nbd is freed to * make sure that the disk and the related kobject are also @@ -261,7 +250,11 @@ static void nbd_notify_destroy_completion(struct nbd_device *nbd) static void nbd_dev_remove(struct nbd_device *nbd) { - nbd_del_disk(nbd); + struct gendisk *disk = nbd->disk; + + del_gendisk(disk); + blk_cleanup_disk(disk); + blk_mq_free_tag_set(&nbd->tag_set); /* * Remove from idr after del_gendisk() completes, so if the same ID is From 7bdc00cf7e369b3be17f26e5643da28de98d9d6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:26 +0200 Subject: [PATCH 116/315] nbd: return the allocated nbd_device from nbd_dev_add Return the device we just allocated instead of doing an extra search for it in the caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-5-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 48530fe01c0f..a81b95c66dbf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1683,7 +1683,7 @@ static const struct blk_mq_ops nbd_mq_ops = { .timeout = nbd_xmit_timeout, }; -static int nbd_dev_add(int index) +static struct nbd_device *nbd_dev_add(int index) { struct nbd_device *nbd; struct gendisk *disk; @@ -1755,7 +1755,7 @@ static int nbd_dev_add(int index) sprintf(disk->disk_name, "nbd%d", index); add_disk(disk); nbd_total_devices++; - return index; + return nbd; out_free_idr: idr_remove(&nbd_index_idr, index); @@ -1764,7 +1764,7 @@ out_free_tags: out_free_nbd: kfree(nbd); out: - return err; + return ERR_PTR(err); } static int find_free_cb(int id, void *ptr, void *data) @@ -1850,25 +1850,22 @@ again: if (index == -1) { ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); if (ret == 0) { - int new_index; - new_index = nbd_dev_add(-1); - if (new_index < 0) { + nbd = nbd_dev_add(-1); + if (IS_ERR(nbd)) { mutex_unlock(&nbd_index_mutex); printk(KERN_ERR "nbd: failed to add new device\n"); - return new_index; + return PTR_ERR(nbd); } - nbd = idr_find(&nbd_index_idr, new_index); } } else { nbd = idr_find(&nbd_index_idr, index); if (!nbd) { - ret = nbd_dev_add(index); - if (ret < 0) { + nbd = nbd_dev_add(index); + if (IS_ERR(nbd)) { mutex_unlock(&nbd_index_mutex); printk(KERN_ERR "nbd: failed to add new device\n"); - return ret; + return PTR_ERR(nbd); } - nbd = idr_find(&nbd_index_idr, index); } } if (!nbd) { From 6177b56c96ff3b5e23d47f6b6c8630f31145da93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:27 +0200 Subject: [PATCH 117/315] nbd: refactor device search and allocation in nbd_genl_connect Use idr_for_each_entry instead of the awkward callback to find an existing device for the index == -1 case, and de-duplicate the device allocation if no existing device was found. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-6-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a81b95c66dbf..8c0e334bdfbf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1767,18 +1767,6 @@ out: return ERR_PTR(err); } -static int find_free_cb(int id, void *ptr, void *data) -{ - struct nbd_device *nbd = ptr; - struct nbd_device **found = data; - - if (!refcount_read(&nbd->config_refs)) { - *found = nbd; - return 1; - } - return 0; -} - /* Netlink interface. */ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { [NBD_ATTR_INDEX] = { .type = NLA_U32 }, @@ -1848,31 +1836,26 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) again: mutex_lock(&nbd_index_mutex); if (index == -1) { - ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); - if (ret == 0) { - nbd = nbd_dev_add(-1); - if (IS_ERR(nbd)) { - mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return PTR_ERR(nbd); + struct nbd_device *tmp; + int id; + + idr_for_each_entry(&nbd_index_idr, tmp, id) { + if (!refcount_read(&tmp->config_refs)) { + nbd = tmp; + break; } } } else { nbd = idr_find(&nbd_index_idr, index); - if (!nbd) { - nbd = nbd_dev_add(index); - if (IS_ERR(nbd)) { - mutex_unlock(&nbd_index_mutex); - printk(KERN_ERR "nbd: failed to add new device\n"); - return PTR_ERR(nbd); - } - } } + if (!nbd) { - printk(KERN_ERR "nbd: couldn't find device at index %d\n", - index); - mutex_unlock(&nbd_index_mutex); - return -EINVAL; + nbd = nbd_dev_add(index); + if (IS_ERR(nbd)) { + mutex_unlock(&nbd_index_mutex); + pr_err("nbd: failed to add new device\n"); + return PTR_ERR(nbd); + } } if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && From 6e4df4c6488165637b95b9701cc862a42a3836ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2021 14:44:28 +0200 Subject: [PATCH 118/315] nbd: reduce the nbd_index_mutex scope nbd_index_mutex is currently held over add_disk and inside ->open, which leads to lock order reversals. Refactor the device creation code path so that nbd_dev_add is called without nbd_index_mutex lock held and only takes it for the IDR insertation. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210811124428.2368491-7-hch@lst.de [axboe: fix whitespace] Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 57 +++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 8c0e334bdfbf..0fe82626bf70 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1683,7 +1683,7 @@ static const struct blk_mq_ops nbd_mq_ops = { .timeout = nbd_xmit_timeout, }; -static struct nbd_device *nbd_dev_add(int index) +static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { struct nbd_device *nbd; struct gendisk *disk; @@ -1709,6 +1709,7 @@ static struct nbd_device *nbd_dev_add(int index) if (err) goto out_free_nbd; + mutex_lock(&nbd_index_mutex); if (index >= 0) { err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, GFP_KERNEL); @@ -1719,6 +1720,7 @@ static struct nbd_device *nbd_dev_add(int index) if (err >= 0) index = err; } + mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; nbd->index = index; @@ -1745,7 +1747,7 @@ static struct nbd_device *nbd_dev_add(int index) mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); - refcount_set(&nbd->refs, 1); + refcount_set(&nbd->refs, refs); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; disk->first_minor = index << part_shift; @@ -1849,35 +1851,36 @@ again: nbd = idr_find(&nbd_index_idr, index); } - if (!nbd) { - nbd = nbd_dev_add(index); - if (IS_ERR(nbd)) { + if (nbd) { + if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { + nbd->destroy_complete = &destroy_complete; mutex_unlock(&nbd_index_mutex); + + /* wait until the nbd device is completely destroyed */ + wait_for_completion(&destroy_complete); + goto again; + } + + if (!refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + if (index == -1) + goto again; + pr_err("nbd: device at index %d is going down\n", + index); + return -EINVAL; + } + mutex_unlock(&nbd_index_mutex); + } else { + mutex_unlock(&nbd_index_mutex); + + nbd = nbd_dev_add(index, 2); + if (IS_ERR(nbd)) { pr_err("nbd: failed to add new device\n"); return PTR_ERR(nbd); } } - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { - nbd->destroy_complete = &destroy_complete; - mutex_unlock(&nbd_index_mutex); - - /* Wait untill the the nbd stuff is totally destroyed */ - wait_for_completion(&destroy_complete); - goto again; - } - - if (!refcount_inc_not_zero(&nbd->refs)) { - mutex_unlock(&nbd_index_mutex); - if (index == -1) - goto again; - printk(KERN_ERR "nbd: device at index %d is going down\n", - index); - return -EINVAL; - } - mutex_unlock(&nbd_index_mutex); - mutex_lock(&nbd->config_lock); if (refcount_read(&nbd->config_refs)) { mutex_unlock(&nbd->config_lock); @@ -2432,10 +2435,8 @@ static int __init nbd_init(void) } nbd_dbg_init(); - mutex_lock(&nbd_index_mutex); for (i = 0; i < nbds_max; i++) - nbd_dev_add(i); - mutex_unlock(&nbd_index_mutex); + nbd_dev_add(i, 1); return 0; } From 3b87265d825a2d29eb6b67511f0e7ed62225cd97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?= Date: Fri, 30 Jul 2021 17:43:08 +0800 Subject: [PATCH 119/315] clocksource/drivers/ingenic: Use bitfield macro helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use "FIELD_GET()" and "FIELD_PREP()" to simplify the code. [dlezcano] : Changed title Signed-off-by: 周琰杰 (Zhou Yanjie) Reviewed-by: Paul Cercueil Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1627638188-116163-1-git-send-email-zhouyanjie@wanyeetech.com --- drivers/clocksource/ingenic-sysost.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/ingenic-sysost.c b/drivers/clocksource/ingenic-sysost.c index a129840f14f9..cb6fc2f152d4 100644 --- a/drivers/clocksource/ingenic-sysost.c +++ b/drivers/clocksource/ingenic-sysost.c @@ -4,6 +4,7 @@ * Copyright (c) 2020 周琰杰 (Zhou Yanjie) */ +#include #include #include #include @@ -34,8 +35,6 @@ /* bits within the OSTCCR register */ #define OSTCCR_PRESCALE1_MASK 0x3 #define OSTCCR_PRESCALE2_MASK 0xc -#define OSTCCR_PRESCALE1_LSB 0 -#define OSTCCR_PRESCALE2_LSB 2 /* bits within the OSTCR register */ #define OSTCR_OST1CLR BIT(0) @@ -98,7 +97,7 @@ static unsigned long ingenic_ost_percpu_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE1_MASK) >> OSTCCR_PRESCALE1_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE1_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -112,7 +111,7 @@ static unsigned long ingenic_ost_global_timer_recalc_rate(struct clk_hw *hw, prescale = readl(ost_clk->ost->base + info->ostccr_reg); - prescale = (prescale & OSTCCR_PRESCALE2_MASK) >> OSTCCR_PRESCALE2_LSB; + prescale = FIELD_GET(OSTCCR_PRESCALE2_MASK, prescale); return parent_rate >> (prescale * 2); } @@ -151,7 +150,8 @@ static int ingenic_ost_percpu_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE1_MASK) | (prescale << OSTCCR_PRESCALE1_LSB); + val &= ~OSTCCR_PRESCALE1_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE1_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; @@ -166,7 +166,8 @@ static int ingenic_ost_global_timer_set_rate(struct clk_hw *hw, unsigned long re int val; val = readl(ost_clk->ost->base + info->ostccr_reg); - val = (val & ~OSTCCR_PRESCALE2_MASK) | (prescale << OSTCCR_PRESCALE2_LSB); + val &= ~OSTCCR_PRESCALE2_MASK; + val |= FIELD_PREP(OSTCCR_PRESCALE2_MASK, prescale); writel(val, ost_clk->ost->base + info->ostccr_reg); return 0; From ce9570657d45d6387a68d7f419fe70d085200a2f Mon Sep 17 00:00:00 2001 From: Fengquan Chen Date: Fri, 9 Apr 2021 17:22:42 +0800 Subject: [PATCH 120/315] clocksource/drivers/mediatek: Optimize systimer irq clear flow on shutdown mtk_syst_clkevt_shutdown is called after irq disabled in suspend flow, clear any pending systimer irq when shutdown to avoid suspend aborted due to timer irq pending Also as for systimer in mediatek socs, there must be firstly enable timer before clear systimer irq Fixes: e3af677607d9("clocksource/drivers/timer-mediatek: Add support for system timer") Signed-off-by: Fengquan Chen Tested-by: Hsin-Yi Wang Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1617960162-1988-2-git-send-email-Fengquan.Chen@mediatek.com --- drivers/clocksource/timer-mediatek.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-mediatek.c b/drivers/clocksource/timer-mediatek.c index ab63b95e414f..7bcb4a3f26fb 100644 --- a/drivers/clocksource/timer-mediatek.c +++ b/drivers/clocksource/timer-mediatek.c @@ -60,9 +60,9 @@ * SYST_CON_EN: Clock enable. Shall be set to * - Start timer countdown. * - Allow timeout ticks being updated. - * - Allow changing interrupt functions. + * - Allow changing interrupt status,like clear irq pending. * - * SYST_CON_IRQ_EN: Set to allow interrupt. + * SYST_CON_IRQ_EN: Set to enable interrupt. * * SYST_CON_IRQ_CLR: Set to clear interrupt. */ @@ -75,6 +75,7 @@ static void __iomem *gpt_sched_reg __read_mostly; static void mtk_syst_ack_irq(struct timer_of *to) { /* Clear and disable interrupt */ + writel(SYST_CON_EN, SYST_CON_REG(to)); writel(SYST_CON_IRQ_CLR | SYST_CON_EN, SYST_CON_REG(to)); } @@ -111,6 +112,9 @@ static int mtk_syst_clkevt_next_event(unsigned long ticks, static int mtk_syst_clkevt_shutdown(struct clock_event_device *clkevt) { + /* Clear any irq */ + mtk_syst_ack_irq(to_timer_of(clkevt)); + /* Disable timer */ writel(0, SYST_CON_REG(to_timer_of(clkevt))); From 3a95de59730eb9ac8dd6a367018f5653a873ecaa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 25 Jul 2021 00:44:23 +0200 Subject: [PATCH 121/315] clocksource/drivers/fttmr010: Pass around less pointers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just pass bool flags from the different initcalls and use the flags to set the right pointers. This results in less pointers passed around in init. Cc: Cédric Le Goater Cc: Joel Stanley Signed-off-by: Linus Walleij Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210724224424.2085404-1-linus.walleij@linaro.org --- drivers/clocksource/timer-fttmr010.c | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/clocksource/timer-fttmr010.c b/drivers/clocksource/timer-fttmr010.c index edb1d5f193f5..126fb1f259b2 100644 --- a/drivers/clocksource/timer-fttmr010.c +++ b/drivers/clocksource/timer-fttmr010.c @@ -271,9 +271,7 @@ static irqreturn_t ast2600_timer_interrupt(int irq, void *dev_id) } static int __init fttmr010_common_init(struct device_node *np, - bool is_aspeed, - int (*timer_shutdown)(struct clock_event_device *), - irq_handler_t irq_handler) + bool is_aspeed, bool is_ast2600) { struct fttmr010 *fttmr010; int irq; @@ -374,8 +372,6 @@ static int __init fttmr010_common_init(struct device_node *np, fttmr010->tick_rate); } - fttmr010->timer_shutdown = timer_shutdown; - /* * Setup clockevent timer (interrupt-driven) on timer 1. */ @@ -383,8 +379,18 @@ static int __init fttmr010_common_init(struct device_node *np, writel(0, fttmr010->base + TIMER1_LOAD); writel(0, fttmr010->base + TIMER1_MATCH1); writel(0, fttmr010->base + TIMER1_MATCH2); - ret = request_irq(irq, irq_handler, IRQF_TIMER, - "FTTMR010-TIMER1", &fttmr010->clkevt); + + if (is_ast2600) { + fttmr010->timer_shutdown = ast2600_timer_shutdown; + ret = request_irq(irq, ast2600_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } else { + fttmr010->timer_shutdown = fttmr010_timer_shutdown; + ret = request_irq(irq, fttmr010_timer_interrupt, + IRQF_TIMER, "FTTMR010-TIMER1", + &fttmr010->clkevt); + } if (ret) { pr_err("FTTMR010-TIMER1 no IRQ\n"); goto out_unmap; @@ -432,23 +438,17 @@ out_disable_clock: static __init int ast2600_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - ast2600_timer_shutdown, - ast2600_timer_interrupt); + return fttmr010_common_init(np, true, true); } static __init int aspeed_timer_init(struct device_node *np) { - return fttmr010_common_init(np, true, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, true, false); } static __init int fttmr010_timer_init(struct device_node *np) { - return fttmr010_common_init(np, false, - fttmr010_timer_shutdown, - fttmr010_timer_interrupt); + return fttmr010_common_init(np, false, false); } TIMER_OF_DECLARE(fttmr010, "faraday,fttmr010", fttmr010_timer_init); From 9ea9b9c48387edc101d56349492ad9c0492ff78d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Aug 2021 15:23:08 +0200 Subject: [PATCH 122/315] remove the lightnvm subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lightnvm supports the OCSSD 1.x and 2.0 specs which were early attempts to produce Open Channel SSDs and never made it into the NVMe spec proper. They have since been superceeded by NVMe enhancements such as ZNS support. Remove the support per the deprecation schedule. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210812132308.38486-1-hch@lst.de Reviewed-by: Matias Bjørling Reviewed-by: Javier González Signed-off-by: Jens Axboe --- Documentation/driver-api/index.rst | 1 - Documentation/driver-api/lightnvm-pblk.rst | 21 - .../userspace-api/ioctl/ioctl-number.rst | 1 - MAINTAINERS | 9 - drivers/Kconfig | 2 - drivers/Makefile | 1 - drivers/lightnvm/Kconfig | 44 - drivers/lightnvm/Makefile | 11 - drivers/lightnvm/core.c | 1440 ----------- drivers/lightnvm/pblk-cache.c | 137 -- drivers/lightnvm/pblk-core.c | 2151 ----------------- drivers/lightnvm/pblk-gc.c | 726 ------ drivers/lightnvm/pblk-init.c | 1324 ---------- drivers/lightnvm/pblk-map.c | 210 -- drivers/lightnvm/pblk-rb.c | 858 ------- drivers/lightnvm/pblk-read.c | 474 ---- drivers/lightnvm/pblk-recovery.c | 874 ------- drivers/lightnvm/pblk-rl.c | 254 -- drivers/lightnvm/pblk-sysfs.c | 728 ------ drivers/lightnvm/pblk-trace.h | 145 -- drivers/lightnvm/pblk-write.c | 665 ----- drivers/lightnvm/pblk.h | 1358 ----------- drivers/nvme/host/Makefile | 1 - drivers/nvme/host/core.c | 13 - drivers/nvme/host/ioctl.c | 4 +- drivers/nvme/host/lightnvm.c | 1274 ---------- drivers/nvme/host/nvme.h | 26 - drivers/nvme/host/pci.c | 6 - include/linux/lightnvm.h | 697 ------ include/uapi/linux/lightnvm.h | 224 -- 30 files changed, 1 insertion(+), 13678 deletions(-) delete mode 100644 Documentation/driver-api/lightnvm-pblk.rst delete mode 100644 drivers/lightnvm/Kconfig delete mode 100644 drivers/lightnvm/Makefile delete mode 100644 drivers/lightnvm/core.c delete mode 100644 drivers/lightnvm/pblk-cache.c delete mode 100644 drivers/lightnvm/pblk-core.c delete mode 100644 drivers/lightnvm/pblk-gc.c delete mode 100644 drivers/lightnvm/pblk-init.c delete mode 100644 drivers/lightnvm/pblk-map.c delete mode 100644 drivers/lightnvm/pblk-rb.c delete mode 100644 drivers/lightnvm/pblk-read.c delete mode 100644 drivers/lightnvm/pblk-recovery.c delete mode 100644 drivers/lightnvm/pblk-rl.c delete mode 100644 drivers/lightnvm/pblk-sysfs.c delete mode 100644 drivers/lightnvm/pblk-trace.h delete mode 100644 drivers/lightnvm/pblk-write.c delete mode 100644 drivers/lightnvm/pblk.h delete mode 100644 drivers/nvme/host/lightnvm.c delete mode 100644 include/linux/lightnvm.h delete mode 100644 include/uapi/linux/lightnvm.h diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index f5a3207aa7fa..c57c609ad2eb 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -85,7 +85,6 @@ available subsections can be seen below. io-mapping io_ordering generic-counter - lightnvm-pblk memory-devices/index men-chameleon-bus ntb diff --git a/Documentation/driver-api/lightnvm-pblk.rst b/Documentation/driver-api/lightnvm-pblk.rst deleted file mode 100644 index 1040ed1cec81..000000000000 --- a/Documentation/driver-api/lightnvm-pblk.rst +++ /dev/null @@ -1,21 +0,0 @@ -pblk: Physical Block Device Target -================================== - -pblk implements a fully associative, host-based FTL that exposes a traditional -block I/O interface. Its primary responsibilities are: - - - Map logical addresses onto physical addresses (4KB granularity) in a - logical-to-physical (L2P) table. - - Maintain the integrity and consistency of the L2P table as well as its - recovery from normal tear down and power outage. - - Deal with controller- and media-specific constrains. - - Handle I/O errors. - - Implement garbage collection. - - Maintain consistency across the I/O stack during synchronization points. - -For more information please refer to: - - http://lightnvm.io - -which maintains updated FAQs, manual pages, technical documentation, tools, -contacts, etc. diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 1409e40e6345..b7070d76f076 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -160,7 +160,6 @@ Code Seq# Include File Comments 'K' all linux/kd.h 'L' 00-1F linux/loop.h conflict! 'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict! -'L' 20-2F linux/lightnvm.h 'L' E0-FF linux/ppdd.h encrypted disk device driver 'M' all linux/soundcard.h conflict! diff --git a/MAINTAINERS b/MAINTAINERS index c9467d2839f5..ec3f59a16c9a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10609,15 +10609,6 @@ F: LICENSES/ F: scripts/spdxcheck-test.sh F: scripts/spdxcheck.py -LIGHTNVM PLATFORM SUPPORT -M: Matias Bjorling -L: linux-block@vger.kernel.org -S: Maintained -W: http://github/OpenChannelSSD -F: drivers/lightnvm/ -F: include/linux/lightnvm.h -F: include/uapi/linux/lightnvm.h - LINEAR RANGES HELPERS M: Mark Brown R: Matti Vaittinen diff --git a/drivers/Kconfig b/drivers/Kconfig index 8bad63417a50..30d2db37cc87 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -51,8 +51,6 @@ source "drivers/net/Kconfig" source "drivers/isdn/Kconfig" -source "drivers/lightnvm/Kconfig" - # input before char - char/joystick depends on it. As does USB. source "drivers/input/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 27c018bdf4de..be5d40ae1488 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -70,7 +70,6 @@ obj-$(CONFIG_FB_I810) += video/fbdev/i810/ obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ obj-$(CONFIG_PARPORT) += parport/ -obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ obj-$(CONFIG_DAX) += dax/ diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig deleted file mode 100644 index 04caa0f2d445..000000000000 --- a/drivers/lightnvm/Kconfig +++ /dev/null @@ -1,44 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Open-Channel SSD NVM configuration -# - -menuconfig NVM - bool "Open-Channel SSD target support (DEPRECATED)" - depends on BLOCK - help - Say Y here to get to enable Open-channel SSDs. - - Open-Channel SSDs implement a set of extension to SSDs, that - exposes direct access to the underlying non-volatile memory. - - If you say N, all options in this submenu will be skipped and disabled - only do this if you know what you are doing. - - This code is deprecated and will be removed in Linux 5.15. - -if NVM - -config NVM_PBLK - tristate "Physical Block Device Open-Channel SSD target" - select CRC32 - help - Allows an open-channel SSD to be exposed as a block device to the - host. The target assumes the device exposes raw flash and must be - explicitly managed by the host. - - Please note the disk format is considered EXPERIMENTAL for now. - -if NVM_PBLK - -config NVM_PBLK_DEBUG - bool "PBlk Debug Support" - default n - help - Enables debug support for pblk. This includes extra checks, more - vocal error messages, and extra tracking fields in the pblk sysfs - entries. - -endif # NVM_PBLK_DEBUG - -endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile deleted file mode 100644 index 97d9d7c71550..000000000000 --- a/drivers/lightnvm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Open-Channel SSDs. -# - -obj-$(CONFIG_NVM) := core.o -obj-$(CONFIG_NVM_PBLK) += pblk.o -pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ - pblk-write.o pblk-cache.o pblk-read.o \ - pblk-gc.o pblk-recovery.o pblk-map.o \ - pblk-rl.o pblk-sysfs.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c deleted file mode 100644 index cf8a75494833..000000000000 --- a/drivers/lightnvm/core.c +++ /dev/null @@ -1,1440 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 IT University of Copenhagen. All rights reserved. - * Initial release: Matias Bjorling - */ - -#define pr_fmt(fmt) "nvm: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static LIST_HEAD(nvm_tgt_types); -static DECLARE_RWSEM(nvm_tgtt_lock); -static LIST_HEAD(nvm_devices); -static DECLARE_RWSEM(nvm_lock); - -/* Map between virtual and physical channel and lun */ -struct nvm_ch_map { - int ch_off; - int num_lun; - int *lun_offs; -}; - -struct nvm_dev_map { - struct nvm_ch_map *chnls; - int num_ch; -}; - -static void nvm_free(struct kref *ref); - -static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name) -{ - struct nvm_target *tgt; - - list_for_each_entry(tgt, &dev->targets, list) - if (!strcmp(name, tgt->disk->disk_name)) - return tgt; - - return NULL; -} - -static bool nvm_target_exists(const char *name) -{ - struct nvm_dev *dev; - struct nvm_target *tgt; - bool ret = false; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - list_for_each_entry(tgt, &dev->targets, list) { - if (!strcmp(name, tgt->disk->disk_name)) { - ret = true; - mutex_unlock(&dev->mlock); - goto out; - } - } - mutex_unlock(&dev->mlock); - } - -out: - up_write(&nvm_lock); - return ret; -} - -static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) { - if (test_and_set_bit(i, dev->lun_map)) { - pr_err("lun %d already allocated\n", i); - goto err; - } - } - - return 0; -err: - while (--i >= lun_begin) - clear_bit(i, dev->lun_map); - - return -EBUSY; -} - -static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin, - int lun_end) -{ - int i; - - for (i = lun_begin; i <= lun_end; i++) - WARN_ON(!test_and_clear_bit(i, dev->lun_map)); -} - -static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_map = tgt_dev->map; - int i, j; - - for (i = 0; i < dev_map->num_ch; i++) { - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs = ch_map->lun_offs; - int ch = i + ch_map->ch_off; - - if (clear) { - for (j = 0; j < ch_map->num_lun; j++) { - int lun = j + lun_offs[j]; - int lunid = (ch * dev->geo.num_lun) + lun; - - WARN_ON(!test_and_clear_bit(lunid, - dev->lun_map)); - } - } - - kfree(ch_map->lun_offs); - } - - kfree(dev_map->chnls); - kfree(dev_map); - - kfree(tgt_dev->luns); - kfree(tgt_dev); -} - -static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, - u16 lun_begin, u16 lun_end, - u16 op) -{ - struct nvm_tgt_dev *tgt_dev = NULL; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_dev_map *dev_map; - struct ppa_addr *luns; - int num_lun = lun_end - lun_begin + 1; - int luns_left = num_lun; - int num_ch = num_lun / dev->geo.num_lun; - int num_ch_mod = num_lun % dev->geo.num_lun; - int bch = lun_begin / dev->geo.num_lun; - int blun = lun_begin % dev->geo.num_lun; - int lunid = 0; - int lun_balanced = 1; - int sec_per_lun, prev_num_lun; - int i, j; - - num_ch = (num_ch_mod == 0) ? num_ch : num_ch + 1; - - dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!dev_map) - goto err_dev; - - dev_map->chnls = kcalloc(num_ch, sizeof(struct nvm_ch_map), GFP_KERNEL); - if (!dev_map->chnls) - goto err_chnls; - - luns = kcalloc(num_lun, sizeof(struct ppa_addr), GFP_KERNEL); - if (!luns) - goto err_luns; - - prev_num_lun = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - for (i = 0; i < num_ch; i++) { - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch]; - int *lun_roffs = ch_rmap->lun_offs; - struct nvm_ch_map *ch_map = &dev_map->chnls[i]; - int *lun_offs; - int luns_in_chnl = (luns_left > dev->geo.num_lun) ? - dev->geo.num_lun : luns_left; - - if (lun_balanced && prev_num_lun != luns_in_chnl) - lun_balanced = 0; - - ch_map->ch_off = ch_rmap->ch_off = bch; - ch_map->num_lun = luns_in_chnl; - - lun_offs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_offs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) { - luns[lunid].ppa = 0; - luns[lunid].a.ch = i; - luns[lunid++].a.lun = j; - - lun_offs[j] = blun; - lun_roffs[j + blun] = blun; - } - - ch_map->lun_offs = lun_offs; - - /* when starting a new channel, lun offset is reset */ - blun = 0; - luns_left -= luns_in_chnl; - } - - dev_map->num_ch = num_ch; - - tgt_dev = kmalloc(sizeof(struct nvm_tgt_dev), GFP_KERNEL); - if (!tgt_dev) - goto err_ch; - - /* Inherit device geometry from parent */ - memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo)); - - /* Target device only owns a portion of the physical device */ - tgt_dev->geo.num_ch = num_ch; - tgt_dev->geo.num_lun = (lun_balanced) ? prev_num_lun : -1; - tgt_dev->geo.all_luns = num_lun; - tgt_dev->geo.all_chunks = num_lun * dev->geo.num_chk; - - tgt_dev->geo.op = op; - - sec_per_lun = dev->geo.clba * dev->geo.num_chk; - tgt_dev->geo.total_secs = num_lun * sec_per_lun; - - tgt_dev->q = dev->q; - tgt_dev->map = dev_map; - tgt_dev->luns = luns; - tgt_dev->parent = dev; - - return tgt_dev; -err_ch: - while (--i >= 0) - kfree(dev_map->chnls[i].lun_offs); - kfree(luns); -err_luns: - kfree(dev_map->chnls); -err_chnls: - kfree(dev_map); -err_dev: - return tgt_dev; -} - -static struct nvm_tgt_type *__nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - list_for_each_entry(tt, &nvm_tgt_types, list) - if (!strcmp(name, tt->name)) - return tt; - - return NULL; -} - -static struct nvm_tgt_type *nvm_find_target_type(const char *name) -{ - struct nvm_tgt_type *tt; - - down_write(&nvm_tgtt_lock); - tt = __nvm_find_target_type(name); - up_write(&nvm_tgtt_lock); - - return tt; -} - -static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin, - int lun_end) -{ - if (lun_begin > lun_end || lun_end >= geo->all_luns) { - pr_err("lun out of bound (%u:%u > %u)\n", - lun_begin, lun_end, geo->all_luns - 1); - return -EINVAL; - } - - return 0; -} - -static int __nvm_config_simple(struct nvm_dev *dev, - struct nvm_ioctl_create_simple *s) -{ - struct nvm_geo *geo = &dev->geo; - - if (s->lun_begin == -1 && s->lun_end == -1) { - s->lun_begin = 0; - s->lun_end = geo->all_luns - 1; - } - - return nvm_config_check_luns(geo, s->lun_begin, s->lun_end); -} - -static int __nvm_config_extended(struct nvm_dev *dev, - struct nvm_ioctl_create_extended *e) -{ - if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) { - e->lun_begin = 0; - e->lun_end = dev->geo.all_luns - 1; - } - - /* op not set falls into target's default */ - if (e->op == 0xFFFF) { - e->op = NVM_TARGET_DEFAULT_OP; - } else if (e->op < NVM_TARGET_MIN_OP || e->op > NVM_TARGET_MAX_OP) { - pr_err("invalid over provisioning value\n"); - return -EINVAL; - } - - return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end); -} - -static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) -{ - struct nvm_ioctl_create_extended e; - struct gendisk *tdisk; - struct nvm_tgt_type *tt; - struct nvm_target *t; - struct nvm_tgt_dev *tgt_dev; - void *targetdata; - unsigned int mdts; - int ret; - - switch (create->conf.type) { - case NVM_CONFIG_TYPE_SIMPLE: - ret = __nvm_config_simple(dev, &create->conf.s); - if (ret) - return ret; - - e.lun_begin = create->conf.s.lun_begin; - e.lun_end = create->conf.s.lun_end; - e.op = NVM_TARGET_DEFAULT_OP; - break; - case NVM_CONFIG_TYPE_EXTENDED: - ret = __nvm_config_extended(dev, &create->conf.e); - if (ret) - return ret; - - e = create->conf.e; - break; - default: - pr_err("config type not valid\n"); - return -EINVAL; - } - - tt = nvm_find_target_type(create->tgttype); - if (!tt) { - pr_err("target type %s not found\n", create->tgttype); - return -EINVAL; - } - - if ((tt->flags & NVM_TGT_F_HOST_L2P) != (dev->geo.dom & NVM_RSP_L2P)) { - pr_err("device is incompatible with target L2P type.\n"); - return -EINVAL; - } - - if (nvm_target_exists(create->tgtname)) { - pr_err("target name already exists (%s)\n", - create->tgtname); - return -EINVAL; - } - - ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end); - if (ret) - return ret; - - t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL); - if (!t) { - ret = -ENOMEM; - goto err_reserve; - } - - tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op); - if (!tgt_dev) { - pr_err("could not create target device\n"); - ret = -ENOMEM; - goto err_t; - } - - tdisk = blk_alloc_disk(dev->q->node); - if (!tdisk) { - ret = -ENOMEM; - goto err_dev; - } - - strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->major = 0; - tdisk->first_minor = 0; - tdisk->fops = tt->bops; - - targetdata = tt->init(tgt_dev, tdisk, create->flags); - if (IS_ERR(targetdata)) { - ret = PTR_ERR(targetdata); - goto err_init; - } - - tdisk->private_data = targetdata; - tdisk->queue->queuedata = targetdata; - - mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA; - if (dev->geo.mdts) { - mdts = min_t(u32, dev->geo.mdts, - (dev->geo.csecs >> 9) * NVM_MAX_VLBA); - } - blk_queue_max_hw_sectors(tdisk->queue, mdts); - - set_capacity(tdisk, tt->capacity(targetdata)); - add_disk(tdisk); - - if (tt->sysfs_init && tt->sysfs_init(tdisk)) { - ret = -ENOMEM; - goto err_sysfs; - } - - t->type = tt; - t->disk = tdisk; - t->dev = tgt_dev; - - mutex_lock(&dev->mlock); - list_add_tail(&t->list, &dev->targets); - mutex_unlock(&dev->mlock); - - __module_get(tt->owner); - - return 0; -err_sysfs: - if (tt->exit) - tt->exit(targetdata, true); -err_init: - blk_cleanup_disk(tdisk); -err_dev: - nvm_remove_tgt_dev(tgt_dev, 0); -err_t: - kfree(t); -err_reserve: - nvm_release_luns_err(dev, e.lun_begin, e.lun_end); - return ret; -} - -static void __nvm_remove_target(struct nvm_target *t, bool graceful) -{ - struct nvm_tgt_type *tt = t->type; - struct gendisk *tdisk = t->disk; - - del_gendisk(tdisk); - - if (tt->sysfs_exit) - tt->sysfs_exit(tdisk); - - if (tt->exit) - tt->exit(tdisk->private_data, graceful); - - nvm_remove_tgt_dev(t->dev, 1); - blk_cleanup_disk(tdisk); - module_put(t->type->owner); - - list_del(&t->list); - kfree(t); -} - -/** - * nvm_remove_tgt - Removes a target from the media manager - * @remove: ioctl structure with target name to remove. - * - * Returns: - * 0: on success - * 1: on not found - * <0: on error - */ -static int nvm_remove_tgt(struct nvm_ioctl_remove *remove) -{ - struct nvm_target *t = NULL; - struct nvm_dev *dev; - - down_read(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - mutex_lock(&dev->mlock); - t = nvm_find_target(dev, remove->tgtname); - if (t) { - mutex_unlock(&dev->mlock); - break; - } - mutex_unlock(&dev->mlock); - } - up_read(&nvm_lock); - - if (!t) { - pr_err("failed to remove target %s\n", - remove->tgtname); - return 1; - } - - __nvm_remove_target(t, true); - kref_put(&dev->ref, nvm_free); - - return 0; -} - -static int nvm_register_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap; - int i, j; - - rmap = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); - if (!rmap) - goto err_rmap; - - rmap->chnls = kcalloc(dev->geo.num_ch, sizeof(struct nvm_ch_map), - GFP_KERNEL); - if (!rmap->chnls) - goto err_chnls; - - for (i = 0; i < dev->geo.num_ch; i++) { - struct nvm_ch_map *ch_rmap; - int *lun_roffs; - int luns_in_chnl = dev->geo.num_lun; - - ch_rmap = &rmap->chnls[i]; - - ch_rmap->ch_off = -1; - ch_rmap->num_lun = luns_in_chnl; - - lun_roffs = kcalloc(luns_in_chnl, sizeof(int), GFP_KERNEL); - if (!lun_roffs) - goto err_ch; - - for (j = 0; j < luns_in_chnl; j++) - lun_roffs[j] = -1; - - ch_rmap->lun_offs = lun_roffs; - } - - dev->rmap = rmap; - - return 0; -err_ch: - while (--i >= 0) - kfree(rmap->chnls[i].lun_offs); -err_chnls: - kfree(rmap); -err_rmap: - return -ENOMEM; -} - -static void nvm_unregister_map(struct nvm_dev *dev) -{ - struct nvm_dev_map *rmap = dev->rmap; - int i; - - for (i = 0; i < dev->geo.num_ch; i++) - kfree(rmap->chnls[i].lun_offs); - - kfree(rmap->chnls); - kfree(rmap); -} - -static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev_map *dev_map = tgt_dev->map; - struct nvm_ch_map *ch_map = &dev_map->chnls[p->a.ch]; - int lun_off = ch_map->lun_offs[p->a.lun]; - - p->a.ch += ch_map->ch_off; - p->a.lun += lun_off; -} - -static void nvm_map_to_tgt(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_dev_map *dev_rmap = dev->rmap; - struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[p->a.ch]; - int lun_roff = ch_rmap->lun_offs[p->a.lun]; - - p->a.ch -= ch_rmap->ch_off; - p->a.lun -= lun_roff; -} - -static void nvm_ppa_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - nvm_map_to_dev(tgt_dev, &ppa_list[i]); - ppa_list[i] = generic_to_dev_addr(tgt_dev->parent, ppa_list[i]); - } -} - -static void nvm_ppa_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppa_list, int nr_ppas) -{ - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa_list[i] = dev_to_generic_addr(tgt_dev->parent, ppa_list[i]); - nvm_map_to_tgt(tgt_dev, &ppa_list[i]); - } -} - -static void nvm_rq_tgt_to_dev(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_tgt_to_dev(tgt_dev, ppa_list, rqd->nr_ppas); -} - -static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - nvm_ppa_dev_to_tgt(tgt_dev, ppa_list, rqd->nr_ppas); -} - -int nvm_register_tgt_type(struct nvm_tgt_type *tt) -{ - int ret = 0; - - down_write(&nvm_tgtt_lock); - if (__nvm_find_target_type(tt->name)) - ret = -EEXIST; - else - list_add(&tt->list, &nvm_tgt_types); - up_write(&nvm_tgtt_lock); - - return ret; -} -EXPORT_SYMBOL(nvm_register_tgt_type); - -void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) -{ - if (!tt) - return; - - down_write(&nvm_tgtt_lock); - list_del(&tt->list); - up_write(&nvm_tgtt_lock); -} -EXPORT_SYMBOL(nvm_unregister_tgt_type); - -void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, - dma_addr_t *dma_handler) -{ - return dev->ops->dev_dma_alloc(dev, dev->dma_pool, mem_flags, - dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_alloc); - -void nvm_dev_dma_free(struct nvm_dev *dev, void *addr, dma_addr_t dma_handler) -{ - dev->ops->dev_dma_free(dev->dma_pool, addr, dma_handler); -} -EXPORT_SYMBOL(nvm_dev_dma_free); - -static struct nvm_dev *nvm_find_nvm_dev(const char *name) -{ - struct nvm_dev *dev; - - list_for_each_entry(dev, &nvm_devices, devices) - if (!strcmp(name, dev->name)) - return dev; - - return NULL; -} - -static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->pln_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->pln_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} - -static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, - struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} - -static int nvm_set_flags(struct nvm_geo *geo, struct nvm_rq *rqd) -{ - int flags = 0; - - if (geo->version == NVM_OCSSD_SPEC_20) - return 0; - - if (rqd->is_seq) - flags |= geo->pln_mode >> 1; - - if (rqd->opcode == NVM_OP_PREAD) - flags |= (NVM_IO_SCRAMBLE_ENABLE | NVM_IO_SUSPEND); - else if (rqd->opcode == NVM_OP_PWRITE) - flags |= NVM_IO_SCRAMBLE_ENABLE; - - return flags; -} - -int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - /* In case of error, fail with right address format */ - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - return ret; -} -EXPORT_SYMBOL(nvm_submit_io); - -static void nvm_sync_end_io(struct nvm_rq *rqd) -{ - struct completion *waiting = rqd->private; - - complete(waiting); -} - -static int nvm_submit_io_wait(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rqd->end_io = nvm_sync_end_io; - rqd->private = &wait; - - ret = dev->ops->submit_io(dev, rqd, buf); - if (ret) - return ret; - - wait_for_completion_io(&wait); - - return 0; -} - -int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_dev *dev = tgt_dev->parent; - int ret; - - if (!dev->ops->submit_io) - return -ENODEV; - - nvm_rq_tgt_to_dev(tgt_dev, rqd); - - rqd->dev = tgt_dev; - rqd->flags = nvm_set_flags(&tgt_dev->geo, rqd); - - ret = nvm_submit_io_wait(dev, rqd, buf); - - return ret; -} -EXPORT_SYMBOL(nvm_submit_io_sync); - -void nvm_end_io(struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *tgt_dev = rqd->dev; - - /* Convert address space */ - if (tgt_dev) - nvm_rq_dev_to_tgt(tgt_dev, rqd); - - if (rqd->end_io) - rqd->end_io(rqd); -} -EXPORT_SYMBOL(nvm_end_io); - -static int nvm_submit_io_sync_raw(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - if (!dev->ops->submit_io) - return -ENODEV; - - rqd->dev = NULL; - rqd->flags = nvm_set_flags(&dev->geo, rqd); - - return nvm_submit_io_wait(dev, rqd, NULL); -} - -static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) -{ - struct nvm_rq rqd = { NULL }; - struct bio bio; - struct bio_vec bio_vec; - struct page *page; - int ret; - - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - bio_init(&bio, &bio_vec, 1); - bio_add_page(&bio, page, PAGE_SIZE, 0); - bio_set_op_attrs(&bio, REQ_OP_READ, 0); - - rqd.bio = &bio; - rqd.opcode = NVM_OP_PREAD; - rqd.is_seq = 1; - rqd.nr_ppas = 1; - rqd.ppa_addr = generic_to_dev_addr(dev, ppa); - - ret = nvm_submit_io_sync_raw(dev, &rqd); - __free_page(page); - if (ret) - return ret; - - return rqd.error; -} - -/* - * Scans a 1.2 chunk first and last page to determine if its state. - * If the chunk is found to be open, also scan it to update the write - * pointer. - */ -static int nvm_bb_chunk_scan(struct nvm_dev *dev, struct ppa_addr ppa, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, pg, pl; - - /* sense first page */ - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) /* valid data */ - meta->state = NVM_CHK_ST_OPEN; - else if (ret > 0) { - /* - * If empty page, the chunk is free, else it is an - * actual io error. In that case, mark it offline. - */ - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - meta->state = NVM_CHK_ST_FREE; - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - goto scan; - default: - return -ret; /* other io error */ - } - } - - /* sense last page */ - ppa.g.pg = geo->num_pg - 1; - ppa.g.pl = geo->num_pln - 1; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { /* Chunk fully written */ - meta->state = NVM_CHK_ST_CLOSED; - meta->wp = geo->clba; - return 0; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->state = NVM_CHK_ST_OPEN; - break; - default: - return -ret; /* other io error */ - } - } - -scan: - /* - * chunk is open, we scan sequentially to update the write pointer. - * We make the assumption that targets write data across all planes - * before moving to the next page. - */ - for (pg = 0; pg < geo->num_pg; pg++) { - for (pl = 0; pl < geo->num_pln; pl++) { - ppa.g.pg = pg; - ppa.g.pl = pl; - - ret = nvm_bb_chunk_sense(dev, ppa); - if (ret < 0) /* io error */ - return ret; - else if (ret == 0) { - meta->wp += geo->ws_min; - } else if (ret > 0) { - switch (ret) { - case NVM_RSP_ERR_EMPTYPAGE: - return 0; - case NVM_RSP_ERR_FAILCRC: - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_WARN_HIGHECC: - meta->wp += geo->ws_min; - break; - default: - return -ret; /* other io error */ - } - } - } - } - - return 0; -} - -/* - * folds a bad block list from its plane representation to its - * chunk representation. - * - * If any of the planes status are bad or grown bad, the chunk is marked - * offline. If not bad, the first plane state acts as the chunk state. - */ -static int nvm_bb_to_chunk(struct nvm_dev *dev, struct ppa_addr ppa, - u8 *blks, int nr_blks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - int ret, blk, pl, offset, blktype; - - for (blk = 0; blk < geo->num_chk; blk++) { - offset = blk * geo->pln_mode; - blktype = blks[offset]; - - for (pl = 0; pl < geo->pln_mode; pl++) { - if (blks[offset + pl] & - (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { - blktype = blks[offset + pl]; - break; - } - } - - ppa.g.blk = blk; - - meta->wp = 0; - meta->type = NVM_CHK_TP_W_SEQ; - meta->wi = 0; - meta->slba = generic_to_dev_addr(dev, ppa).ppa; - meta->cnlb = dev->geo.clba; - - if (blktype == NVM_BLK_T_FREE) { - ret = nvm_bb_chunk_scan(dev, ppa, meta); - if (ret) - return ret; - } else { - meta->state = NVM_CHK_ST_OFFLINE; - } - - meta++; - } - - return 0; -} - -static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - u8 *blks; - int ch, lun, nr_blks; - int ret = 0; - - ppa.ppa = slba; - ppa = dev_to_generic_addr(dev, ppa); - - if (ppa.g.blk != 0) - return -EINVAL; - - if ((nchks % geo->num_chk) != 0) - return -EINVAL; - - nr_blks = geo->num_chk * geo->pln_mode; - - blks = kmalloc(nr_blks, GFP_KERNEL); - if (!blks) - return -ENOMEM; - - for (ch = ppa.g.ch; ch < geo->num_ch; ch++) { - for (lun = ppa.g.lun; lun < geo->num_lun; lun++) { - struct ppa_addr ppa_gen, ppa_dev; - - if (!nchks) - goto done; - - ppa_gen.ppa = 0; - ppa_gen.g.ch = ch; - ppa_gen.g.lun = lun; - ppa_dev = generic_to_dev_addr(dev, ppa_gen); - - ret = dev->ops->get_bb_tbl(dev, ppa_dev, blks); - if (ret) - goto done; - - ret = nvm_bb_to_chunk(dev, ppa_gen, blks, nr_blks, - meta); - if (ret) - goto done; - - meta += geo->num_chk; - nchks -= geo->num_chk; - } - } -done: - kfree(blks); - return ret; -} - -int nvm_get_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa, - int nchks, struct nvm_chk_meta *meta) -{ - struct nvm_dev *dev = tgt_dev->parent; - - nvm_ppa_tgt_to_dev(tgt_dev, &ppa, 1); - - if (dev->geo.version == NVM_OCSSD_SPEC_12) - return nvm_get_bb_meta(dev, (sector_t)ppa.ppa, nchks, meta); - - return dev->ops->get_chk_meta(dev, (sector_t)ppa.ppa, nchks, meta); -} -EXPORT_SYMBOL_GPL(nvm_get_chunk_meta); - -int nvm_set_chunk_meta(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_rq rqd; - int ret; - - if (dev->geo.version == NVM_OCSSD_SPEC_20) - return 0; - - if (nr_ppas > NVM_MAX_VLBA) { - pr_err("unable to update all blocks atomically\n"); - return -EINVAL; - } - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); - nvm_rq_tgt_to_dev(tgt_dev, &rqd); - - ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(tgt_dev, &rqd); - if (ret) - return -EINVAL; - - return 0; -} -EXPORT_SYMBOL_GPL(nvm_set_chunk_meta); - -static int nvm_core_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret; - - dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns), - sizeof(unsigned long), GFP_KERNEL); - if (!dev->lun_map) - return -ENOMEM; - - INIT_LIST_HEAD(&dev->area_list); - INIT_LIST_HEAD(&dev->targets); - mutex_init(&dev->mlock); - spin_lock_init(&dev->lock); - - ret = nvm_register_map(dev); - if (ret) - goto err_fmtype; - - return 0; -err_fmtype: - kfree(dev->lun_map); - return ret; -} - -static void nvm_free(struct kref *ref) -{ - struct nvm_dev *dev = container_of(ref, struct nvm_dev, ref); - - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - - if (dev->rmap) - nvm_unregister_map(dev); - - kfree(dev->lun_map); - kfree(dev); -} - -static int nvm_init(struct nvm_dev *dev) -{ - struct nvm_geo *geo = &dev->geo; - int ret = -EINVAL; - - if (dev->ops->identity(dev)) { - pr_err("device could not be identified\n"); - goto err; - } - - pr_debug("ver:%u.%u nvm_vendor:%x\n", geo->major_ver_id, - geo->minor_ver_id, geo->vmnt); - - ret = nvm_core_init(dev); - if (ret) { - pr_err("could not initialize core structures.\n"); - goto err; - } - - pr_info("registered %s [%u/%u/%u/%u/%u]\n", - dev->name, dev->geo.ws_min, dev->geo.ws_opt, - dev->geo.num_chk, dev->geo.all_luns, - dev->geo.num_ch); - return 0; -err: - pr_err("failed to initialize nvm\n"); - return ret; -} - -struct nvm_dev *nvm_alloc_dev(int node) -{ - struct nvm_dev *dev; - - dev = kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); - if (dev) - kref_init(&dev->ref); - - return dev; -} -EXPORT_SYMBOL(nvm_alloc_dev); - -int nvm_register(struct nvm_dev *dev) -{ - int ret, exp_pool_size; - - pr_warn_once("lightnvm support is deprecated and will be removed in Linux 5.15.\n"); - - if (!dev->q || !dev->ops) { - kref_put(&dev->ref, nvm_free); - return -EINVAL; - } - - ret = nvm_init(dev); - if (ret) { - kref_put(&dev->ref, nvm_free); - return ret; - } - - exp_pool_size = max_t(int, PAGE_SIZE, - (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos))); - exp_pool_size = round_up(exp_pool_size, PAGE_SIZE); - - dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist", - exp_pool_size); - if (!dev->dma_pool) { - pr_err("could not create dma pool\n"); - kref_put(&dev->ref, nvm_free); - return -ENOMEM; - } - - /* register device with a supported media manager */ - down_write(&nvm_lock); - list_add(&dev->devices, &nvm_devices); - up_write(&nvm_lock); - - return 0; -} -EXPORT_SYMBOL(nvm_register); - -void nvm_unregister(struct nvm_dev *dev) -{ - struct nvm_target *t, *tmp; - - mutex_lock(&dev->mlock); - list_for_each_entry_safe(t, tmp, &dev->targets, list) { - if (t->dev->parent != dev) - continue; - __nvm_remove_target(t, false); - kref_put(&dev->ref, nvm_free); - } - mutex_unlock(&dev->mlock); - - down_write(&nvm_lock); - list_del(&dev->devices); - up_write(&nvm_lock); - - kref_put(&dev->ref, nvm_free); -} -EXPORT_SYMBOL(nvm_unregister); - -static int __nvm_configure_create(struct nvm_ioctl_create *create) -{ - struct nvm_dev *dev; - int ret; - - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(create->dev); - up_write(&nvm_lock); - - if (!dev) { - pr_err("device not found\n"); - return -EINVAL; - } - - kref_get(&dev->ref); - ret = nvm_create_tgt(dev, create); - if (ret) - kref_put(&dev->ref, nvm_free); - - return ret; -} - -static long nvm_ioctl_info(struct file *file, void __user *arg) -{ - struct nvm_ioctl_info *info; - struct nvm_tgt_type *tt; - int tgt_iter = 0; - - info = memdup_user(arg, sizeof(struct nvm_ioctl_info)); - if (IS_ERR(info)) - return PTR_ERR(info); - - info->version[0] = NVM_VERSION_MAJOR; - info->version[1] = NVM_VERSION_MINOR; - info->version[2] = NVM_VERSION_PATCH; - - down_write(&nvm_tgtt_lock); - list_for_each_entry(tt, &nvm_tgt_types, list) { - struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; - - tgt->version[0] = tt->version[0]; - tgt->version[1] = tt->version[1]; - tgt->version[2] = tt->version[2]; - strncpy(tgt->tgtname, tt->name, NVM_TTYPE_NAME_MAX); - - tgt_iter++; - } - - info->tgtsize = tgt_iter; - up_write(&nvm_tgtt_lock); - - if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { - kfree(info); - return -EFAULT; - } - - kfree(info); - return 0; -} - -static long nvm_ioctl_get_devices(struct file *file, void __user *arg) -{ - struct nvm_ioctl_get_devices *devices; - struct nvm_dev *dev; - int i = 0; - - devices = kzalloc(sizeof(struct nvm_ioctl_get_devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - down_write(&nvm_lock); - list_for_each_entry(dev, &nvm_devices, devices) { - struct nvm_ioctl_device_info *info = &devices->info[i]; - - strlcpy(info->devname, dev->name, sizeof(info->devname)); - - /* kept for compatibility */ - info->bmversion[0] = 1; - info->bmversion[1] = 0; - info->bmversion[2] = 0; - strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); - i++; - - if (i >= ARRAY_SIZE(devices->info)) { - pr_err("max %zd devices can be reported.\n", - ARRAY_SIZE(devices->info)); - break; - } - } - up_write(&nvm_lock); - - devices->nr_devices = i; - - if (copy_to_user(arg, devices, - sizeof(struct nvm_ioctl_get_devices))) { - kfree(devices); - return -EFAULT; - } - - kfree(devices); - return 0; -} - -static long nvm_ioctl_dev_create(struct file *file, void __user *arg) -{ - struct nvm_ioctl_create create; - - if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create))) - return -EFAULT; - - if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED && - create.conf.e.rsv != 0) { - pr_err("reserved config field in use\n"); - return -EINVAL; - } - - create.dev[DISK_NAME_LEN - 1] = '\0'; - create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0'; - create.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (create.flags != 0) { - __u32 flags = create.flags; - - /* Check for valid flags */ - if (flags & NVM_TARGET_FACTORY) - flags &= ~NVM_TARGET_FACTORY; - - if (flags) { - pr_err("flag not supported\n"); - return -EINVAL; - } - } - - return __nvm_configure_create(&create); -} - -static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) -{ - struct nvm_ioctl_remove remove; - - if (copy_from_user(&remove, arg, sizeof(struct nvm_ioctl_remove))) - return -EFAULT; - - remove.tgtname[DISK_NAME_LEN - 1] = '\0'; - - if (remove.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return nvm_remove_tgt(&remove); -} - -/* kept for compatibility reasons */ -static long nvm_ioctl_dev_init(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_init init; - - if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) - return -EFAULT; - - if (init.flags != 0) { - pr_err("no flags supported\n"); - return -EINVAL; - } - - return 0; -} - -/* Kept for compatibility reasons */ -static long nvm_ioctl_dev_factory(struct file *file, void __user *arg) -{ - struct nvm_ioctl_dev_factory fact; - - if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) - return -EFAULT; - - fact.dev[DISK_NAME_LEN - 1] = '\0'; - - if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) - return -EINVAL; - - return 0; -} - -static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case NVM_INFO: - return nvm_ioctl_info(file, argp); - case NVM_GET_DEVICES: - return nvm_ioctl_get_devices(file, argp); - case NVM_DEV_CREATE: - return nvm_ioctl_dev_create(file, argp); - case NVM_DEV_REMOVE: - return nvm_ioctl_dev_remove(file, argp); - case NVM_DEV_INIT: - return nvm_ioctl_dev_init(file, argp); - case NVM_DEV_FACTORY: - return nvm_ioctl_dev_factory(file, argp); - } - return 0; -} - -static const struct file_operations _ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = nvm_ctl_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _nvm_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lightnvm", - .nodename = "lightnvm/control", - .fops = &_ctl_fops, -}; -builtin_misc_device(_nvm_misc); diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c deleted file mode 100644 index f185f1a00008..000000000000 --- a/drivers/lightnvm/pblk-cache.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-cache.c - pblk's write cache - */ - -#include "pblk.h" - -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags) -{ - struct pblk_w_ctx w_ctx; - sector_t lba = pblk_get_lba(bio); - unsigned long start_time; - unsigned int bpos, pos; - int nr_entries = pblk_get_secs(bio); - int i, ret; - - start_time = bio_start_io_acct(bio); - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos); - switch (ret) { - case NVM_IO_REQUEUE: - io_schedule(); - goto retry; - case NVM_IO_ERR: - pblk_pipeline_stop(pblk); - bio_io_error(bio); - goto out; - } - - pblk_ppa_set_empty(&w_ctx.ppa); - w_ctx.flags = flags; - if (bio->bi_opf & REQ_PREFLUSH) { - w_ctx.flags |= PBLK_FLUSH_ENTRY; - pblk_write_kick(pblk); - } - - if (unlikely(!bio_has_data(bio))) - goto out; - - for (i = 0; i < nr_entries; i++) { - void *data = bio_data(bio); - - w_ctx.lba = lba + i; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i); - pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos); - - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - - atomic64_add(nr_entries, &pblk->user_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_entries, &pblk->inflight_writes); - atomic_long_add(nr_entries, &pblk->req_writes); -#endif - - pblk_rl_inserted(&pblk->rl, nr_entries); - -out: - bio_end_io_acct(bio, start_time); - pblk_write_should_kick(pblk); - - if (ret == NVM_IO_DONE) - bio_endio(bio); -} - -/* - * On GC the incoming lbas are not necessarily sequential. Also, some of the - * lbas might not be valid entries, which are marked as empty by the GC thread - */ -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct pblk_w_ctx w_ctx; - unsigned int bpos, pos; - void *data = gc_rq->data; - int i, valid_entries; - - /* Update the write buffer head (mem) with the entries that we can - * write. The write in itself cannot fail, so there is no need to - * rollback from here on. - */ -retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { - io_schedule(); - goto retry; - } - - w_ctx.flags = PBLK_IOTYPE_GC; - pblk_ppa_set_empty(&w_ctx.ppa); - - for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { - if (gc_rq->lba_list[i] == ADDR_EMPTY) - continue; - - w_ctx.lba = gc_rq->lba_list[i]; - - pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, - gc_rq->paddr_list[i], pos); - - data += PBLK_EXPOSED_PAGE_SIZE; - valid_entries++; - } - - WARN_ONCE(gc_rq->secs_to_gc != valid_entries, - "pblk: inconsistent GC write\n"); - - atomic64_add(valid_entries, &pblk->gc_wa); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_entries, &pblk->inflight_writes); - atomic_long_add(valid_entries, &pblk->recov_gc_writes); -#endif - - pblk_write_should_kick(pblk); - return NVM_IO_OK; -} diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c deleted file mode 100644 index 33d39d3dd343..000000000000 --- a/drivers/lightnvm/pblk-core.c +++ /dev/null @@ -1,2151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-core.c - pblk's core functionality - * - */ - -#define CREATE_TRACE_POINTS - -#include "pblk.h" -#include "pblk-trace.h" - -static void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_chunk_meta(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, *ppa); - pos = pblk_ppa_to_pos(&dev->geo, *ppa); - - pblk_err(pblk, "failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, - struct ppa_addr ppa_addr) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa; - int pos = pblk_ppa_to_pos(geo, ppa_addr); - - pblk_debug(pblk, "erase failed: line:%d, pos:%d\n", line->id, pos); - atomic_long_inc(&pblk->erase_failed); - - atomic_dec(&line->blk_in_line); - if (test_and_set_bit(pos, line->blk_bitmap)) - pblk_err(pblk, "attempted to erase bb: line:%d, pos:%d\n", - line->id, pos); - - /* Not necessary to mark bad blocks on 2.0 spec. */ - if (geo->version == NVM_OCSSD_SPEC_20) - return; - - ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC); - if (!ppa) - return; - - *ppa = ppa_addr; - pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, - GFP_ATOMIC, pblk->bb_wq); -} - -static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct pblk_line *line; - int pos; - - line = pblk_ppa_to_line(pblk, rqd->ppa_addr); - pos = pblk_ppa_to_pos(geo, rqd->ppa_addr); - chunk = &line->chks[pos]; - - atomic_dec(&line->left_seblks); - - if (rqd->error) { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_FAILED); - - chunk->state = NVM_CHK_ST_OFFLINE; - pblk_mark_bb(pblk, line, rqd->ppa_addr); - } else { - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &rqd->ppa_addr, PBLK_CHUNK_RESET_DONE); - - chunk->state = NVM_CHK_ST_FREE; - } - - trace_pblk_chunk_state(pblk_disk_name(pblk), &rqd->ppa_addr, - chunk->state); - - atomic_dec(&pblk->inflight_io); -} - -/* Erase completion assumes that only one block is erased at the time */ -static void pblk_end_io_erase(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - - __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, &pblk->e_rq_pool); -} - -/* - * Get information for all chunks from the device. - * - * The caller is responsible for freeing (vmalloc) the returned structure - */ -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *meta; - struct ppa_addr ppa; - unsigned long len; - int ret; - - ppa.ppa = 0; - - len = geo->all_chunks * sizeof(*meta); - meta = vzalloc(len); - if (!meta) - return ERR_PTR(-ENOMEM); - - ret = nvm_get_chunk_meta(dev, ppa, geo->all_chunks, meta); - if (ret) { - vfree(meta); - return ERR_PTR(-EIO); - } - - return meta; -} - -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *meta, - struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ch_off = ppa.m.grp * geo->num_chk * geo->num_lun; - int lun_off = ppa.m.pu * geo->num_chk; - int chk_off = ppa.m.chk; - - return meta + ch_off + lun_off + chk_off; -} - -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - - /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P - * table is modified with reclaimed sectors, a check is done to endure - * that newer updates are not overwritten. - */ - spin_lock(&line->lock); - WARN_ON(line->state == PBLK_LINESTATE_FREE); - - if (test_and_set_bit(paddr, line->invalid_bitmap)) { - WARN_ONCE(1, "pblk: double invalidate\n"); - spin_unlock(&line->lock); - return; - } - le32_add_cpu(line->vsc, -1); - - if (line->state == PBLK_LINESTATE_CLOSED) - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - if (move_list) { - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC) { - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - return; - } - spin_unlock(&line->lock); - - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - } -} - -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - u64 paddr; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); - BUG_ON(pblk_ppa_empty(ppa)); -#endif - - line = pblk_ppa_to_line(pblk, ppa); - paddr = pblk_dev_ppa_to_line_addr(pblk, ppa); - - __pblk_map_invalidate(pblk, line, paddr); -} - -static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, - unsigned int nr_secs) -{ - sector_t lba; - - spin_lock(&pblk->trans_lock); - for (lba = slba; lba < slba + nr_secs; lba++) { - struct ppa_addr ppa; - - ppa = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); - - pblk_ppa_set_empty(&ppa); - pblk_trans_map_set(pblk, lba, ppa); - } - spin_unlock(&pblk->trans_lock); -} - -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &rqd->dma_meta_list); - if (!rqd->meta_list) - return -ENOMEM; - - if (rqd->nr_ppas == 1) - return 0; - - rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk); - rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk); - - return 0; -} - -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - if (rqd->meta_list) - nvm_dev_dma_free(dev->parent, rqd->meta_list, - rqd->dma_meta_list); -} - -/* Caller must guarantee that the request is a valid type */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) -{ - mempool_t *pool; - struct nvm_rq *rqd; - int rq_size; - - switch (type) { - case PBLK_WRITE: - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - rq_size = pblk_w_rq_size; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - rq_size = pblk_g_rq_size; - break; - default: - pool = &pblk->e_rq_pool; - rq_size = pblk_g_rq_size; - } - - rqd = mempool_alloc(pool, GFP_KERNEL); - memset(rqd, 0, rq_size); - - return rqd; -} - -/* Typically used on completion path. Cannot guarantee request consistency */ -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) -{ - mempool_t *pool; - - switch (type) { - case PBLK_WRITE: - kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); - fallthrough; - case PBLK_WRITE_INT: - pool = &pblk->w_rq_pool; - break; - case PBLK_READ: - pool = &pblk->r_rq_pool; - break; - case PBLK_ERASE: - pool = &pblk->e_rq_pool; - break; - default: - pblk_err(pblk, "trying to free unknown rqd type\n"); - return; - } - - pblk_free_rqd_meta(pblk, rqd); - mempool_free(rqd, pool); -} - -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages) -{ - struct bio_vec *bv; - struct page *page; - int i, e, nbv = 0; - - for (i = 0; i < bio->bi_vcnt; i++) { - bv = &bio->bi_io_vec[i]; - page = bv->bv_page; - for (e = 0; e < bv->bv_len; e += PBLK_EXPOSED_PAGE_SIZE, nbv++) - if (nbv >= off) - mempool_free(page++, &pblk->page_bio_pool); - } -} - -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages) -{ - struct request_queue *q = pblk->dev->q; - struct page *page; - int i, ret; - - for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(&pblk->page_bio_pool, flags); - - ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); - if (ret != PBLK_EXPOSED_PAGE_SIZE) { - pblk_err(pblk, "could not add page to bio\n"); - mempool_free(page, &pblk->page_bio_pool); - goto err; - } - } - - return 0; -err: - pblk_bio_free_pages(pblk, bio, (bio->bi_vcnt - i), i); - return -1; -} - -void pblk_write_kick(struct pblk *pblk) -{ - wake_up_process(pblk->writer_ts); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000)); -} - -void pblk_write_timer_fn(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, wtimer); - - /* kick the write thread every tick to flush outstanding data */ - pblk_write_kick(pblk); -} - -void pblk_write_should_kick(struct pblk *pblk) -{ - unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb); - - if (secs_avail >= pblk->min_write_pgs_data) - pblk_write_kick(pblk); -} - -static void pblk_wait_for_meta(struct pblk *pblk) -{ - do { - if (!atomic_read(&pblk->inflight_io)) - break; - - schedule(); - } while (1); -} - -static void pblk_flush_writer(struct pblk *pblk) -{ - pblk_rb_flush(&pblk->rwb); - do { - if (!pblk_rb_sync_count(&pblk->rwb)) - break; - - pblk_write_kick(pblk); - schedule(); - } while (1); -} - -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list = NULL; - int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data) - * (pblk->min_write_pgs - pblk->min_write_pgs_data); - int vsc = le32_to_cpu(*line->vsc) + packed_meta; - - lockdep_assert_held(&line->lock); - - if (line->w_err_gc->has_write_err) { - if (line->gc_group != PBLK_LINEGC_WERR) { - line->gc_group = PBLK_LINEGC_WERR; - move_list = &l_mg->gc_werr_list; - pblk_rl_werr_line_in(&pblk->rl); - } - } else if (!vsc) { - if (line->gc_group != PBLK_LINEGC_FULL) { - line->gc_group = PBLK_LINEGC_FULL; - move_list = &l_mg->gc_full_list; - } - } else if (vsc < lm->high_thrs) { - if (line->gc_group != PBLK_LINEGC_HIGH) { - line->gc_group = PBLK_LINEGC_HIGH; - move_list = &l_mg->gc_high_list; - } - } else if (vsc < lm->mid_thrs) { - if (line->gc_group != PBLK_LINEGC_MID) { - line->gc_group = PBLK_LINEGC_MID; - move_list = &l_mg->gc_mid_list; - } - } else if (vsc < line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_LOW) { - line->gc_group = PBLK_LINEGC_LOW; - move_list = &l_mg->gc_low_list; - } - } else if (vsc == line->sec_in_line) { - if (line->gc_group != PBLK_LINEGC_EMPTY) { - line->gc_group = PBLK_LINEGC_EMPTY; - move_list = &l_mg->gc_empty_list; - } - } else { - line->state = PBLK_LINESTATE_CORRUPT; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - line->gc_group = PBLK_LINEGC_NONE; - move_list = &l_mg->corrupt_list; - pblk_err(pblk, "corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n", - line->id, vsc, - line->sec_in_line, - lm->high_thrs, lm->mid_thrs); - } - - return move_list; -} - -void pblk_discard(struct pblk *pblk, struct bio *bio) -{ - sector_t slba = pblk_get_lba(bio); - sector_t nr_secs = pblk_get_secs(bio); - - pblk_invalidate_range(pblk, slba, nr_secs); -} - -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - atomic_long_inc(&pblk->write_failed); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd) -{ - /* Empty page read is not necessarily an error (e.g., L2P recovery) */ - if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) { - atomic_long_inc(&pblk->read_empty); - return; - } - - switch (rqd->error) { - case NVM_RSP_WARN_HIGHECC: - atomic_long_inc(&pblk->read_high_ecc); - break; - case NVM_RSP_ERR_FAILECC: - case NVM_RSP_ERR_FAILCRC: - atomic_long_inc(&pblk->read_failed); - break; - default: - pblk_err(pblk, "unknown read error:%d\n", rqd->error); - } -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, rqd, rqd->error); -#endif -} - -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write) -{ - pblk->sec_per_write = sec_per_write; -} - -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - return nvm_submit_io(dev, rqd, buf); -} - -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - struct ppa_addr *ppa = &ppa_list[i]; - struct nvm_chk_meta *chunk = pblk_dev_ppa_to_chunk(pblk, *ppa); - u64 caddr = pblk_dev_ppa_to_chunk_addr(pblk, *ppa); - - if (caddr == 0) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_OPEN); - else if (caddr == (chunk->cnlb - 1)) - trace_pblk_chunk_state(pblk_disk_name(pblk), - ppa, NVM_CHK_ST_CLOSED); - } -} - -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - int ret; - - atomic_inc(&pblk->inflight_io); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (pblk_check_io(pblk, rqd)) - return NVM_IO_ERR; -#endif - - ret = nvm_submit_io_sync(dev, rqd, buf); - - if (trace_pblk_chunk_state_enabled() && !ret && - rqd->opcode == NVM_OP_PWRITE) - pblk_check_chunk_state_update(pblk, rqd); - - return ret; -} - -static int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd, - void *buf) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int ret; - - pblk_down_chunk(pblk, ppa_list[0]); - ret = pblk_submit_io_sync(pblk, rqd, buf); - pblk_up_chunk(pblk, ppa_list[0]); - - return ret; -} - -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta) -{ - int max = pblk->sec_per_write; - int min = pblk->min_write_pgs; - int secs_to_sync = 0; - - if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs) - min = max = pblk->min_write_pgs_data; - - if (secs_avail >= max) - secs_to_sync = max; - else if (secs_avail >= min) - secs_to_sync = min * (secs_avail / min); - else if (secs_to_flush) - secs_to_sync = min; - - return secs_to_sync; -} - -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - spin_lock(&line->lock); - addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - line->cur_sec = addr - nr_secs; - - for (i = 0; i < nr_secs; i++, line->cur_sec--) - WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); - spin_unlock(&line->lock); -} - -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - int i; - - lockdep_assert_held(&line->lock); - - /* logic error: ppa out-of-bounds. Prevent generating bad address */ - if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) { - WARN(1, "pblk: page allocation out of bounds\n"); - nr_secs = pblk->lm.sec_per_line - line->cur_sec; - } - - line->cur_sec = addr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - for (i = 0; i < nr_secs; i++, line->cur_sec++) - WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap)); - - return addr; -} - -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -{ - u64 addr; - - /* Lock needed in case a write fails and a recovery needs to remap - * failed write buffer entries - */ - spin_lock(&line->lock); - addr = __pblk_alloc_page(pblk, line, nr_secs); - line->left_msecs -= nr_secs; - WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n"); - spin_unlock(&line->lock); - - return addr; -} - -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line) -{ - u64 paddr; - - spin_lock(&line->lock); - paddr = find_next_zero_bit(line->map_bitmap, - pblk->lm.sec_per_line, line->cur_sec); - spin_unlock(&line->lock); - - return paddr; -} - -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int bit; - - /* This usually only happens on bad lines */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (bit >= lm->blk_per_line) - return -1; - - return bit * geo->ws_opt; -} - -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = pblk_line_smeta_start(pblk, line); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - ret = pblk_submit_io_sync(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line, - u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - int i, ret; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - rqd.opcode = NVM_OP_PWRITE; - rqd.nr_ppas = lm->smeta_sec; - rqd.is_seq = 1; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < lm->smeta_sec; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - rqd.meta_list, i); - - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - meta->lba = lba_list[paddr] = addr_empty; - } - - ret = pblk_submit_io_sync_sem(pblk, &rqd, line->smeta); - if (ret) { - pblk_err(pblk, "smeta I/O submission failed: %d\n", ret); - goto clear_rqd; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - pblk_log_write_err(pblk, &rqd); - ret = -EIO; - } - -clear_rqd: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} - -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *ppa_list_buf, *meta_list; - struct ppa_addr *ppa_list; - struct nvm_rq rqd; - u64 paddr = line->emeta_ssec; - dma_addr_t dma_ppa_list, dma_meta_list; - int min = pblk->min_write_pgs; - int left_ppas = lm->emeta_sec[0]; - int line_id = line->id; - int rq_ppas, rq_len; - int i, j; - int ret; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, - &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list_buf = meta_list + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - -next_rq: - memset(&rqd, 0, sizeof(struct nvm_rq)); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - rq_len = rq_ppas * geo->csecs; - - rqd.meta_list = meta_list; - rqd.ppa_list = ppa_list_buf; - rqd.dma_meta_list = dma_meta_list; - rqd.dma_ppa_list = dma_ppa_list; - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = rq_ppas; - ppa_list = nvm_rq_to_ppa_list(&rqd); - - for (i = 0; i < rqd.nr_ppas; ) { - struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id); - int pos = pblk_ppa_to_pos(geo, ppa); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd.is_seq = 1; - - while (test_bit(pos, line->blk_bitmap)) { - paddr += min; - if (pblk_boundary_paddr_checks(pblk, paddr)) { - ret = -EINTR; - goto free_rqd_dma; - } - - ppa = addr_to_gen_ppa(pblk, paddr, line_id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - if (pblk_boundary_paddr_checks(pblk, paddr + min)) { - ret = -EINTR; - goto free_rqd_dma; - } - - for (j = 0; j < min; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id); - } - - ret = pblk_submit_io_sync(pblk, &rqd, emeta_buf); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto free_rqd_dma; - } - - atomic_dec(&pblk->inflight_io); - - if (rqd.error && rqd.error != NVM_RSP_WARN_HIGHECC) { - pblk_log_read_err(pblk, &rqd); - ret = -EIO; - goto free_rqd_dma; - } - - emeta_buf += rq_len; - left_ppas -= rq_ppas; - if (left_ppas) - goto next_rq; - -free_rqd_dma: - nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return ret; -} - -static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr ppa) -{ - rqd->opcode = NVM_OP_ERASE; - rqd->ppa_addr = ppa; - rqd->nr_ppas = 1; - rqd->is_seq = 1; - rqd->bio = NULL; -} - -static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq rqd = {NULL}; - int ret; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), &ppa, - PBLK_CHUNK_RESET_START); - - pblk_setup_e_rq(pblk, &rqd, ppa); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. - */ - ret = pblk_submit_io_sync(pblk, &rqd, NULL); - rqd.private = pblk; - __pblk_end_io_erase(pblk, &rqd); - - return ret; -} - -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct ppa_addr ppa; - int ret, bit = -1; - - /* Erase only good blocks, one at a time */ - do { - spin_lock(&line->lock); - bit = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line, - bit + 1); - if (bit >= lm->blk_per_line) { - spin_unlock(&line->lock); - break; - } - - ppa = pblk->luns[bit].bppa; /* set ch and lun */ - ppa.a.blk = line->id; - - atomic_dec(&line->left_eblks); - WARN_ON(test_and_set_bit(bit, line->erase_bitmap)); - spin_unlock(&line->lock); - - ret = pblk_blk_erase_sync(pblk, ppa); - if (ret) { - pblk_err(pblk, "failed to erase line %d\n", line->id); - return ret; - } - } while (1); - - return 0; -} - -static void pblk_line_setup_metadata(struct pblk_line *line, - struct pblk_line_mgmt *l_mg, - struct pblk_line_meta *lm) -{ - int meta_line; - - lockdep_assert_held(&l_mg->free_lock); - -retry_meta: - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - if (meta_line == PBLK_DATA_LINES) { - spin_unlock(&l_mg->free_lock); - io_schedule(); - spin_lock(&l_mg->free_lock); - goto retry_meta; - } - - set_bit(meta_line, &l_mg->meta_bitmap); - line->meta_line = meta_line; - - line->smeta = l_mg->sline_meta[meta_line]; - line->emeta = l_mg->eline_meta[meta_line]; - - memset(line->smeta, 0, lm->smeta_len); - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. - */ -static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, - struct pblk_line *cur) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct line_smeta *smeta_buf = (struct line_smeta *)line->smeta; - int nr_blk_line; - - /* After erasing the line, new bad blocks might appear and we risk - * having an invalid line - */ - nr_blk_line = lm->blk_per_line - - bitmap_weight(line->blk_bitmap, lm->blk_per_line); - if (nr_blk_line < lm->min_blk_line) { - spin_lock(&l_mg->free_lock); - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - spin_unlock(&l_mg->free_lock); - - pblk_debug(pblk, "line %d is bad\n", line->id); - - return 0; - } - - /* Run-time metadata */ - line->lun_bitmap = ((void *)(smeta_buf)) + sizeof(struct line_smeta); - - /* Mark LUNs allocated in this line (all for now) */ - bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len); - - smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(smeta_buf->header.uuid, &pblk->instance_uuid); - smeta_buf->header.id = cpu_to_le32(line->id); - smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version_major = SMETA_VERSION_MAJOR; - smeta_buf->header.version_minor = SMETA_VERSION_MINOR; - - /* Start metadata */ - smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns); - - /* Fill metadata among lines */ - if (cur) { - memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len); - smeta_buf->prev_id = cpu_to_le32(cur->id); - cur->emeta->buf->next_id = cpu_to_le32(line->id); - } else { - smeta_buf->prev_id = cpu_to_le32(PBLK_LINE_EMPTY); - } - - /* All smeta must be set at this point */ - smeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &smeta_buf->header)); - smeta_buf->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta_buf)); - - /* End metadata */ - memcpy(&emeta_buf->header, &smeta_buf->header, - sizeof(struct line_header)); - - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - - emeta_buf->seq_nr = cpu_to_le64(line->seq_nr); - emeta_buf->nr_lbas = cpu_to_le64(line->sec_in_line); - emeta_buf->nr_valid_lbas = cpu_to_le64(0); - emeta_buf->next_id = cpu_to_le32(PBLK_LINE_EMPTY); - emeta_buf->crc = cpu_to_le32(0); - emeta_buf->prev_id = smeta_buf->prev_id; - - return 1; -} - -static int pblk_line_alloc_bitmaps(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - line->map_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->map_bitmap) - return -ENOMEM; - - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - - /* will be initialized using bb info from map_bitmap */ - line->invalid_bitmap = mempool_alloc(l_mg->bitmap_pool, GFP_KERNEL); - if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - return -ENOMEM; - } - - return 0; -} - -/* For now lines are always assumed full lines. Thus, smeta former and current - * lun bitmaps are omitted. - */ -static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, - int init) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - u64 off; - int bit = -1; - int emeta_secs; - - line->sec_in_line = lm->sec_per_line; - - /* Capture bad block information on line mapping bitmaps */ - while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line, - bit + 1)) < lm->blk_per_line) { - off = bit * geo->ws_opt; - bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off, - lm->sec_per_line); - bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux, - lm->sec_per_line); - line->sec_in_line -= geo->clba; - } - - /* Mark smeta metadata sectors as bad sectors */ - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - off = bit * geo->ws_opt; - bitmap_set(line->map_bitmap, off, lm->smeta_sec); - line->sec_in_line -= lm->smeta_sec; - line->cur_sec = off + lm->smeta_sec; - - if (init && pblk_line_smeta_write(pblk, line, off)) { - pblk_debug(pblk, "line smeta I/O failed. Retry\n"); - return 0; - } - - bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line); - - /* Mark emeta metadata sectors as bad sectors. We need to consider bad - * blocks to make sure that there are enough sectors to store emeta - */ - emeta_secs = lm->emeta_sec[0]; - off = lm->sec_per_line; - while (emeta_secs) { - off -= geo->ws_opt; - if (!test_bit(off, line->invalid_bitmap)) { - bitmap_set(line->invalid_bitmap, off, geo->ws_opt); - emeta_secs -= geo->ws_opt; - } - } - - line->emeta_ssec = off; - line->sec_in_line -= lm->emeta_sec[0]; - line->nr_valid_lbas = 0; - line->left_msecs = line->sec_in_line; - *line->vsc = cpu_to_le32(line->sec_in_line); - - if (lm->sec_per_line - line->sec_in_line != - bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - pblk_err(pblk, "unexpected line %d is bad\n", line->id); - - return 0; - } - - return 1; -} - -static int pblk_prepare_new_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int blk_to_erase = atomic_read(&line->blk_in_line); - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - /* Free chunks should not be erased */ - if (state & NVM_CHK_ST_FREE) { - set_bit(pblk_ppa_to_pos(geo, rlun->bppa), - line->erase_bitmap); - blk_to_erase--; - } - } - - return blk_to_erase; -} - -static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = atomic_read(&line->blk_in_line); - int blk_to_erase; - - /* Bad blocks do not need to be erased */ - bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line); - - spin_lock(&line->lock); - - /* If we have not written to this line, we need to mark up free chunks - * as already erased - */ - if (line->state == PBLK_LINESTATE_NEW) { - blk_to_erase = pblk_prepare_new_line(pblk, line); - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } else { - blk_to_erase = blk_in_line; - } - - if (blk_in_line < lm->min_blk_line) { - spin_unlock(&line->lock); - return -EAGAIN; - } - - if (line->state != PBLK_LINESTATE_FREE) { - WARN(1, "pblk: corrupted line %d, state %d\n", - line->id, line->state); - spin_unlock(&line->lock); - return -EINTR; - } - - line->state = PBLK_LINESTATE_OPEN; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - atomic_set(&line->left_eblks, blk_to_erase); - atomic_set(&line->left_seblks, blk_to_erase); - - line->meta_distance = lm->meta_distance; - spin_unlock(&line->lock); - - kref_init(&line->ref); - atomic_set(&line->sec_to_update, 0); - - return 0; -} - -/* Line allocations in the recovery path are always single threaded */ -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - l_mg->data_line = line; - list_del(&line->list); - - ret = pblk_line_prepare(pblk, line); - if (ret) { - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - return ret; - } - spin_unlock(&l_mg->free_lock); - - ret = pblk_line_alloc_bitmaps(pblk, line); - if (ret) - goto fail; - - if (!pblk_line_init_bb(pblk, line, 0)) { - ret = -EINTR; - goto fail; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - return 0; - -fail: - spin_lock(&l_mg->free_lock); - list_add(&line->list, &l_mg->free_list); - spin_unlock(&l_mg->free_lock); - - return ret; -} - -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -static void pblk_line_reinit(struct pblk_line *line) -{ - *line->vsc = cpu_to_le32(EMPTY_ENTRY); - - line->map_bitmap = NULL; - line->invalid_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; -} - -void pblk_line_free(struct pblk_line *line) -{ - struct pblk *pblk = line->pblk; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - mempool_free(line->invalid_bitmap, l_mg->bitmap_pool); - - pblk_line_reinit(line); -} - -struct pblk_line *pblk_line_get(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line; - int ret, bit; - - lockdep_assert_held(&l_mg->free_lock); - -retry: - if (list_empty(&l_mg->free_list)) { - pblk_err(pblk, "no free lines\n"); - return NULL; - } - - line = list_first_entry(&l_mg->free_list, struct pblk_line, list); - list_del(&line->list); - l_mg->nr_free_lines--; - - bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (unlikely(bit >= lm->blk_per_line)) { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_BAD; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_add_tail(&line->list, &l_mg->bad_list); - - pblk_debug(pblk, "line %d is bad\n", line->id); - goto retry; - } - - ret = pblk_line_prepare(pblk, line); - if (ret) { - switch (ret) { - case -EAGAIN: - list_add(&line->list, &l_mg->bad_list); - goto retry; - case -EINTR: - list_add(&line->list, &l_mg->corrupt_list); - goto retry; - default: - pblk_err(pblk, "failed to prepare line %d\n", line->id); - list_add(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - return NULL; - } - } - - return line; -} - -static struct pblk_line *pblk_line_retry(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *retry_line; - -retry: - spin_lock(&l_mg->free_lock); - retry_line = pblk_line_get(pblk); - if (!retry_line) { - l_mg->data_line = NULL; - spin_unlock(&l_mg->free_lock); - return NULL; - } - - retry_line->map_bitmap = line->map_bitmap; - retry_line->invalid_bitmap = line->invalid_bitmap; - retry_line->smeta = line->smeta; - retry_line->emeta = line->emeta; - retry_line->meta_line = line->meta_line; - - pblk_line_reinit(line); - - l_mg->data_line = retry_line; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_dec(&pblk->rl, line, false); - - if (pblk_line_erase(pblk, retry_line)) - goto retry; - - return retry_line; -} - -static void pblk_set_space_limit(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - - atomic_set(&rl->rb_space, 0); -} - -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - - spin_lock(&l_mg->free_lock); - line = pblk_line_get(pblk); - if (!line) { - spin_unlock(&l_mg->free_lock); - return NULL; - } - - line->seq_nr = l_mg->d_seq_nr++; - line->type = PBLK_LINETYPE_DATA; - l_mg->data_line = line; - - pblk_line_setup_metadata(line, l_mg, &pblk->lm); - - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. - * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_set_space_limit(pblk); - - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - - if (pblk_line_alloc_bitmaps(pblk, line)) - return NULL; - - if (pblk_line_erase(pblk, line)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - } - -retry_setup: - if (!pblk_line_init_metadata(pblk, line, NULL)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, line, 1)) { - line = pblk_line_retry(pblk, line); - if (!line) - return NULL; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, line, true); - - return line; -} - -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa) -{ - struct pblk_line *line; - - line = pblk_ppa_to_line(pblk, ppa); - kref_put(&line->ref, pblk_line_put_wq); -} - -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int i; - - for (i = 0; i < rqd->nr_ppas; i++) - pblk_ppa_to_line_put(pblk, ppa_list[i]); -} - -static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) -{ - lockdep_assert_held(&pblk->l_mg.free_lock); - - pblk_set_space_limit(pblk); - pblk->state = PBLK_STATE_STOPPING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); -} - -static void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pblk_err(pblk, "sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - -void __pblk_pipeline_flush(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int ret; - - spin_lock(&l_mg->free_lock); - if (pblk->state == PBLK_STATE_RECOVERING || - pblk->state == PBLK_STATE_STOPPED) { - spin_unlock(&l_mg->free_lock); - return; - } - pblk->state = PBLK_STATE_RECOVERING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - spin_unlock(&l_mg->free_lock); - - pblk_flush_writer(pblk); - pblk_wait_for_meta(pblk); - - ret = pblk_recov_pad(pblk); - if (ret) { - pblk_err(pblk, "could not close data on teardown(%d)\n", ret); - return; - } - - flush_workqueue(pblk->bb_wq); - pblk_line_close_meta_sync(pblk); -} - -void __pblk_pipeline_stop(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - - spin_lock(&l_mg->free_lock); - pblk->state = PBLK_STATE_STOPPED; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - l_mg->data_line = NULL; - l_mg->data_next = NULL; - spin_unlock(&l_mg->free_lock); -} - -void pblk_pipeline_stop(struct pblk *pblk) -{ - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); -} - -struct pblk_line *pblk_line_replace_data(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new = NULL; - unsigned int left_seblks; - - new = l_mg->data_next; - if (!new) - goto out; - - spin_lock(&l_mg->free_lock); - cur = l_mg->data_line; - l_mg->data_line = new; - - pblk_line_setup_metadata(new, l_mg, &pblk->lm); - spin_unlock(&l_mg->free_lock); - -retry_erase: - left_seblks = atomic_read(&new->left_seblks); - if (left_seblks) { - /* If line is not fully erased, erase it */ - if (atomic_read(&new->left_eblks)) { - if (pblk_line_erase(pblk, new)) - goto out; - } else { - io_schedule(); - } - goto retry_erase; - } - - if (pblk_line_alloc_bitmaps(pblk, new)) - return NULL; - -retry_setup: - if (!pblk_line_init_metadata(pblk, new, cur)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - if (!pblk_line_init_bb(pblk, new, 1)) { - new = pblk_line_retry(pblk, new); - if (!new) - goto out; - - goto retry_setup; - } - - pblk_rl_free_lines_dec(&pblk->rl, new, true); - - /* Allocate next line for preparation */ - spin_lock(&l_mg->free_lock); - l_mg->data_next = pblk_line_get(pblk); - if (!l_mg->data_next) { - /* If we cannot get a new line, we need to stop the pipeline. - * Only allow as many writes in as we can store safely and then - * fail gracefully - */ - pblk_stop_writes(pblk, new); - l_mg->data_next = NULL; - } else { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - } - spin_unlock(&l_mg->free_lock); - -out: - return new; -} - -static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - if (line->w_err_gc->has_gc_err) { - spin_unlock(&line->lock); - pblk_err(pblk, "line %d had errors during GC\n", line->id); - pblk_put_line_back(pblk, line); - line->w_err_gc->has_gc_err = 0; - return; - } - - line->state = PBLK_LINESTATE_FREE; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - line->gc_group = PBLK_LINEGC_NONE; - pblk_line_free(line); - - if (line->w_err_gc->has_write_err) { - pblk_rl_werr_line_out(&pblk->rl); - line->w_err_gc->has_write_err = 0; - } - - spin_unlock(&line->lock); - atomic_dec(&gc->pipeline_gc); - - spin_lock(&l_mg->free_lock); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - spin_unlock(&l_mg->free_lock); - - pblk_rl_free_lines_inc(&pblk->rl, line); -} - -static void pblk_line_put_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_put_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = line_put_ws->pblk; - struct pblk_line *line = line_put_ws->line; - - __pblk_line_put(pblk, line); - mempool_free(line_put_ws, &pblk->gen_ws_pool); -} - -void pblk_line_put(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - - __pblk_line_put(pblk, line); -} - -void pblk_line_put_wq(struct kref *ref) -{ - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; - struct pblk_line_ws *line_put_ws; - - line_put_ws = mempool_alloc(&pblk->gen_ws_pool, GFP_ATOMIC); - if (!line_put_ws) - return; - - line_put_ws->pblk = pblk; - line_put_ws->line = line; - line_put_ws->priv = NULL; - - INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); - queue_work(pblk->r_end_wq, &line_put_ws->ws); -} - -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_rq *rqd; - int err; - - rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); - - pblk_setup_e_rq(pblk, rqd, ppa); - - rqd->end_io = pblk_end_io_erase; - rqd->private = pblk; - - trace_pblk_chunk_reset(pblk_disk_name(pblk), - &ppa, PBLK_CHUNK_RESET_START); - - /* The write thread schedules erases so that it minimizes disturbances - * with writes. Thus, there is no need to take the LUN semaphore. - */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - pblk_err(pblk, "could not async erase line:%d,blk:%d\n", - pblk_ppa_to_line_id(ppa), - pblk_ppa_to_pos(geo, ppa)); - } - - return err; -} - -struct pblk_line *pblk_line_get_data(struct pblk *pblk) -{ - return pblk->l_mg.data_line; -} - -/* For now, always erase next line */ -struct pblk_line *pblk_line_get_erase(struct pblk *pblk) -{ - return pblk->l_mg.data_next; -} - -int pblk_line_is_full(struct pblk_line *line) -{ - return (line->left_msecs == 0); -} - -static void pblk_line_should_sync_meta(struct pblk *pblk) -{ - if (pblk_rl_is_limit(&pblk->rl)) - pblk_line_close_meta_sync(pblk); -} - -void pblk_line_close(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - int i; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN(!bitmap_full(line->map_bitmap, lm->sec_per_line), - "pblk: corrupt closed line %d\n", line->id); -#endif - - spin_lock(&l_mg->free_lock); - WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_OPEN); - line->state = PBLK_LINESTATE_CLOSED; - move_list = pblk_line_gc_list(pblk, line); - list_add_tail(&line->list, move_list); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - int pos = pblk_ppa_to_pos(geo, rlun->bppa); - int state = line->chks[pos].state; - - if (!(state & NVM_CHK_ST_OFFLINE)) - state = NVM_CHK_ST_CLOSED; - } - - spin_unlock(&line->lock); - spin_unlock(&l_mg->gc_lock); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); -} - -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - struct wa_counters *wa = emeta_to_wa(lm, emeta_buf); - - /* No need for exact vsc value; avoid a big line lock and take aprox. */ - memcpy(emeta_to_vsc(pblk, emeta_buf), l_mg->vsc_list, lm->vsc_list_len); - memcpy(emeta_to_bb(emeta_buf), line->blk_bitmap, lm->blk_bitmap_len); - - wa->user = cpu_to_le64(atomic64_read(&pblk->user_wa)); - wa->pad = cpu_to_le64(atomic64_read(&pblk->pad_wa)); - wa->gc = cpu_to_le64(atomic64_read(&pblk->gc_wa)); - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) { - emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC); - export_guid(emeta_buf->header.uuid, &pblk->instance_uuid); - emeta_buf->header.id = cpu_to_le32(line->id); - emeta_buf->header.type = cpu_to_le16(line->type); - emeta_buf->header.version_major = EMETA_VERSION_MAJOR; - emeta_buf->header.version_minor = EMETA_VERSION_MINOR; - emeta_buf->header.crc = cpu_to_le32( - pblk_calc_meta_header_crc(pblk, &emeta_buf->header)); - } - - emeta_buf->nr_valid_lbas = cpu_to_le64(line->nr_valid_lbas); - emeta_buf->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, emeta_buf)); - - spin_lock(&l_mg->close_lock); - spin_lock(&line->lock); - - /* Update the in-memory start address for emeta, in case it has - * shifted due to write errors - */ - if (line->emeta_ssec != line->cur_sec) - line->emeta_ssec = line->cur_sec; - - list_add_tail(&line->list, &l_mg->emeta_list); - spin_unlock(&line->lock); - spin_unlock(&l_mg->close_lock); - - pblk_line_should_sync_meta(pblk); -} - -static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - struct pblk_emeta *emeta = line->emeta; - - w_err_gc->lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf), - lba_list_size); -} - -void pblk_line_close_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - /* Write errors makes the emeta start address stored in smeta invalid, - * so keep a copy of the lba list until we've gc'd the line - */ - if (w_err_gc->has_write_err) - pblk_save_lba_list(pblk, line); - - pblk_line_close(pblk, line); - mempool_free(line_ws, &pblk->gen_ws_pool); -} - -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq) -{ - struct pblk_line_ws *line_ws; - - line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); - if (!line_ws) { - pblk_err(pblk, "pblk: could not allocate memory\n"); - return; - } - - line_ws->pblk = pblk; - line_ws->line = line; - line_ws->priv = priv; - - INIT_WORK(&line_ws->ws, work); - queue_work(wq, &line_ws->ws); -} - -static void __pblk_down_chunk(struct pblk *pblk, int pos) -{ - struct pblk_lun *rlun = &pblk->luns[pos]; - int ret; - - /* - * Only send one inflight I/O per LUN. Since we map at a page - * granurality, all ppas in the I/O will map to the same LUN - */ - - ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret == -ETIME || ret == -EINTR) - pblk_err(pblk, "taking lun semaphore timed out: err %d\n", - -ret); -} - -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - __pblk_down_chunk(pblk, pos); -} - -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int pos = pblk_ppa_to_pos(geo, ppa); - - /* If the LUN has been locked for this same request, do no attempt to - * lock it again - */ - if (test_and_set_bit(pos, lun_bitmap)) - return; - - __pblk_down_chunk(pblk, pos); -} - -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int pos = pblk_ppa_to_pos(geo, ppa); - - rlun = &pblk->luns[pos]; - up(&rlun->wr_sem); -} - -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int num_lun = geo->all_luns; - int bit = -1; - - while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) { - rlun = &pblk->luns[bit]; - up(&rlun->wr_sem); - } -} - -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - struct ppa_addr ppa_l2p; - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) - pblk_map_invalidate(pblk, ppa_l2p); - - pblk_trans_map_set(pblk, lba, ppa); - spin_unlock(&pblk->trans_lock); -} - -void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) -{ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); -#endif - - pblk_update_map(pblk, lba, ppa); -} - -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line, u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int ret = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa_new)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); -#endif - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return 0; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { - spin_lock(&gc_line->lock); - WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), - "pblk: corrupted GC update"); - spin_unlock(&gc_line->lock); - - ret = 0; - goto out; - } - - pblk_trans_map_set(pblk, lba, ppa_new); -out: - spin_unlock(&pblk->trans_lock); - return ret; -} - -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) -{ - struct ppa_addr ppa_l2p; - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa_mapped)); -#endif - /* Invalidate and discard padded entries */ - if (lba == ADDR_EMPTY) { - atomic64_inc(&pblk->pad_wa); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->padded_wb); -#endif - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - return; - } - - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - return; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - - /* Do not update L2P if the cacheline has been updated. In this case, - * the mapped ppa must be invalidated - */ - if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { - if (!pblk_ppa_empty(ppa_mapped)) - pblk_map_invalidate(pblk, ppa_mapped); - goto out; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); -#endif - - pblk_trans_map_set(pblk, lba, ppa_mapped); -out: - spin_unlock(&pblk->trans_lock); -} - -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache) -{ - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - struct ppa_addr ppa; - - ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); - - /* If the L2P entry maps to a line, the reference is valid */ - if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { - struct pblk_line *line = pblk_ppa_to_line(pblk, ppa); - - if (i > 0 && *from_cache) - break; - *from_cache = false; - - kref_get(&line->ref); - } else { - if (i > 0 && !*from_cache) - break; - *from_cache = true; - } - } - spin_unlock(&pblk->trans_lock); - return i; -} - -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs) -{ - u64 lba; - int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) { - lba = lba_list[i]; - if (lba != ADDR_EMPTY) { - /* logic error: lba out-of-bounds. Ignore update */ - if (!(lba < pblk->capacity)) { - WARN(1, "pblk: corrupted L2P map request\n"); - continue; - } - ppas[i] = pblk_trans_map_get(pblk, lba); - } - } - spin_unlock(&pblk->trans_lock); -} - -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *buffer; - - if (pblk_is_oob_meta_supported(pblk)) { - /* Just use OOB metadata buffer as always */ - buffer = rqd->meta_list; - } else { - /* We need to reuse last page of request (packed metadata) - * in similar way as traditional oob metadata - */ - buffer = page_to_virt( - rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - } - - return buffer; -} - -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd) -{ - void *meta_list = rqd->meta_list; - void *page; - int i = 0; - - if (pblk_is_oob_meta_supported(pblk)) - return; - - page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page); - /* We need to fill oob meta buffer with data from packed metadata */ - for (; i < rqd->nr_ppas; i++) - memcpy(pblk_get_meta(pblk, meta_list, i), - page + (i * sizeof(struct pblk_sec_meta)), - sizeof(struct pblk_sec_meta)); -} diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c deleted file mode 100644 index b31658be35a7..000000000000 --- a/drivers/lightnvm/pblk-gc.c +++ /dev/null @@ -1,726 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-gc.c - pblk's garbage collector - */ - -#include "pblk.h" -#include "pblk-trace.h" -#include - - -static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) -{ - vfree(gc_rq->data); - kfree(gc_rq); -} - -static int pblk_gc_write(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_gc_rq *gc_rq, *tgc_rq; - LIST_HEAD(w_list); - - spin_lock(&gc->w_lock); - if (list_empty(&gc->w_list)) { - spin_unlock(&gc->w_lock); - return 1; - } - - list_cut_position(&w_list, &gc->w_list, gc->w_list.prev); - gc->w_entries = 0; - spin_unlock(&gc->w_lock); - - list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq); - list_del(&gc_rq->list); - kref_put(&gc_rq->line->ref, pblk_line_put); - pblk_gc_free_gc_rq(gc_rq); - } - - return 0; -} - -static void pblk_gc_writer_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_writer_ts); -} - -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct list_head *move_list; - - spin_lock(&l_mg->gc_lock); - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_GC); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - /* We need to reset gc_group in order to ensure that - * pblk_line_gc_list will return proper move_list - * since right now current line is not on any of the - * gc lists. - */ - line->gc_group = PBLK_LINEGC_NONE; - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - list_add_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); -} - -static void pblk_gc_line_ws(struct work_struct *work) -{ - struct pblk_line_ws *gc_rq_ws = container_of(work, - struct pblk_line_ws, ws); - struct pblk *pblk = gc_rq_ws->pblk; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq_ws->line; - struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; - int ret; - - up(&gc->gc_sem); - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) { - line->w_err_gc->has_gc_err = 1; - goto out; - } - - if (!gc_rq->secs_to_gc) - goto out; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - kfree(gc_rq_ws); - return; - -out: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - kfree(gc_rq_ws); -} - -static __le64 *get_lba_list_from_emeta(struct pblk *pblk, - struct pblk_line *line) -{ - struct line_emeta *emeta_buf; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int lba_list_size = lm->emeta_len[2]; - __le64 *lba_list; - int ret; - - emeta_buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta_buf) - return NULL; - - ret = pblk_line_emeta_read(pblk, line, emeta_buf); - if (ret) { - pblk_err(pblk, "line %d read emeta failed (%d)\n", - line->id, ret); - kvfree(emeta_buf); - return NULL; - } - - /* If this read fails, it means that emeta is corrupted. - * For now, leave the line untouched. - * TODO: Implement a recovery routine that scans and moves - * all sectors on the line. - */ - - ret = pblk_recov_check_emeta(pblk, emeta_buf); - if (ret) { - pblk_err(pblk, "inconsistent emeta (line %d)\n", - line->id); - kvfree(emeta_buf); - return NULL; - } - - lba_list = kvmalloc(lba_list_size, GFP_KERNEL); - - if (lba_list) - memcpy(lba_list, emeta_to_lbas(pblk, emeta_buf), lba_list_size); - - kvfree(emeta_buf); - - return lba_list; -} - -static void pblk_gc_line_prepare_ws(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct pblk_line *line = line_ws->line; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *gc_rq_ws; - struct pblk_gc_rq *gc_rq; - __le64 *lba_list; - unsigned long *invalid_bitmap; - int sec_left, nr_secs, bit; - - invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!invalid_bitmap) - goto fail_free_ws; - - if (line->w_err_gc->has_write_err) { - lba_list = line->w_err_gc->lba_list; - line->w_err_gc->lba_list = NULL; - } else { - lba_list = get_lba_list_from_emeta(pblk, line); - if (!lba_list) { - pblk_err(pblk, "could not interpret emeta (line %d)\n", - line->id); - goto fail_free_invalid_bitmap; - } - } - - spin_lock(&line->lock); - bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); - sec_left = pblk_line_vsc(line); - spin_unlock(&line->lock); - - if (sec_left < 0) { - pblk_err(pblk, "corrupted GC line (%d)\n", line->id); - goto fail_free_lba_list; - } - - bit = -1; -next_rq: - gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL); - if (!gc_rq) - goto fail_free_lba_list; - - nr_secs = 0; - do { - bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, - bit + 1); - if (bit > line->emeta_ssec) - break; - - gc_rq->paddr_list[nr_secs] = bit; - gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); - } while (nr_secs < pblk->max_write_pgs); - - if (unlikely(!nr_secs)) { - kfree(gc_rq); - goto out; - } - - gc_rq->nr_secs = nr_secs; - gc_rq->line = line; - - gc_rq->data = vmalloc(array_size(gc_rq->nr_secs, geo->csecs)); - if (!gc_rq->data) - goto fail_free_gc_rq; - - gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!gc_rq_ws) - goto fail_free_gc_data; - - gc_rq_ws->pblk = pblk; - gc_rq_ws->line = line; - gc_rq_ws->priv = gc_rq; - - /* The write GC path can be much slower than the read GC one due to - * the budget imposed by the rate-limiter. Balance in case that we get - * back pressure from the write GC path. - */ - while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) - io_schedule(); - - kref_get(&line->ref); - - INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); - - sec_left -= nr_secs; - if (sec_left > 0) - goto next_rq; - -out: - kvfree(lba_list); - kfree(line_ws); - kfree(invalid_bitmap); - - kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->read_inflight_gc); - - return; - -fail_free_gc_data: - vfree(gc_rq->data); -fail_free_gc_rq: - kfree(gc_rq); -fail_free_lba_list: - kvfree(lba_list); -fail_free_invalid_bitmap: - kfree(invalid_bitmap); -fail_free_ws: - kfree(line_ws); - - /* Line goes back to closed state, so we cannot release additional - * reference for line, since we do that only when we want to do - * gc to free line state transition. - */ - pblk_put_line_back(pblk, line); - atomic_dec(&gc->read_inflight_gc); - - pblk_err(pblk, "failed to GC line %d\n", line->id); -} - -static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line_ws *line_ws; - - pblk_debug(pblk, "line '%d' being reclaimed for GC\n", line->id); - - line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); - if (!line_ws) - return -ENOMEM; - - line_ws->pblk = pblk; - line_ws->line = line; - - atomic_inc(&gc->pipeline_gc); - INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); - queue_work(gc->gc_reader_wq, &line_ws->ws); - - return 0; -} - -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - -static void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - -static int pblk_gc_read(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - spin_lock(&gc->r_lock); - if (list_empty(&gc->r_list)) { - spin_unlock(&gc->r_lock); - return 1; - } - - line = list_first_entry(&gc->r_list, struct pblk_line, list); - list_del(&line->list); - spin_unlock(&gc->r_lock); - - pblk_gc_kick(pblk); - - if (pblk_gc_line(pblk, line)) { - pblk_err(pblk, "failed to GC line %d\n", line->id); - /* rollback */ - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - } - - return 0; -} - -static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, - struct list_head *group_list) -{ - struct pblk_line *line, *victim; - unsigned int line_vsc = ~0x0L, victim_vsc = ~0x0L; - - victim = list_first_entry(group_list, struct pblk_line, list); - - list_for_each_entry(line, group_list, list) { - if (!atomic_read(&line->sec_to_update)) - line_vsc = le32_to_cpu(*line->vsc); - if (line_vsc < victim_vsc) { - victim = line; - victim_vsc = le32_to_cpu(*victim->vsc); - } - } - - if (victim_vsc == ~0x0) - return NULL; - - return victim; -} - -static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) -{ - unsigned int nr_blocks_free, nr_blocks_need; - unsigned int werr_lines = atomic_read(&rl->werr_lines); - - nr_blocks_need = pblk_rl_high_thrs(rl); - nr_blocks_free = pblk_rl_nr_free_blks(rl); - - /* This is not critical, no need to take lock here */ - return ((werr_lines > 0) || - ((gc->gc_active) && (nr_blocks_need > nr_blocks_free))); -} - -void pblk_gc_free_full_lines(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - - do { - spin_lock(&l_mg->gc_lock); - if (list_empty(&l_mg->gc_full_list)) { - spin_unlock(&l_mg->gc_lock); - return; - } - - line = list_first_entry(&l_mg->gc_full_list, - struct pblk_line, list); - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - atomic_inc(&gc->pipeline_gc); - kref_put(&line->ref, pblk_line_put); - } while (1); -} - -/* - * Lines with no valid sectors will be returned to the free list immediately. If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int read_inflight_gc, gc_group = 0, prev_group = 0; - - pblk_gc_free_full_lines(pblk); - - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) - return; - -next_gc_group: - group_list = l_mg->gc_lists[gc_group++]; - - do { - spin_lock(&l_mg->gc_lock); - - line = pblk_gc_get_victim_line(pblk, group_list); - if (!line) { - spin_unlock(&l_mg->gc_lock); - break; - } - - spin_lock(&line->lock); - WARN_ON(line->state != PBLK_LINESTATE_CLOSED); - line->state = PBLK_LINESTATE_GC; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - spin_unlock(&line->lock); - - list_del(&line->list); - spin_unlock(&l_mg->gc_lock); - - spin_lock(&gc->r_lock); - list_add_tail(&line->list, &gc->r_list); - spin_unlock(&gc->r_lock); - - read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); - pblk_gc_reader_kick(gc); - - prev_group = 1; - - /* No need to queue up more GC lines than we can handle */ - run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) - break; - } while (1); - - if (!prev_group && pblk->rl.rb_state > gc_group && - gc_group < PBLK_GC_NR_LISTS) - goto next_gc_group; -} - -static void pblk_gc_timer(struct timer_list *t) -{ - struct pblk *pblk = from_timer(pblk, t, gc.gc_timer); - - pblk_gc_kick(pblk); -} - -static int pblk_gc_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - pblk_gc_run(pblk); - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_writer_ts(void *data) -{ - struct pblk *pblk = data; - - while (!kthread_should_stop()) { - if (!pblk_gc_write(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} - -static int pblk_gc_reader_ts(void *data) -{ - struct pblk *pblk = data; - struct pblk_gc *gc = &pblk->gc; - - while (!kthread_should_stop()) { - if (!pblk_gc_read(pblk)) - continue; - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "flushing gc pipeline, %d lines left\n", - atomic_read(&gc->pipeline_gc)); -#endif - - do { - if (!atomic_read(&gc->pipeline_gc)) - break; - - schedule(); - } while (1); - - return 0; -} - -static void pblk_gc_start(struct pblk *pblk) -{ - pblk->gc.gc_active = 1; - pblk_debug(pblk, "gc start\n"); -} - -void pblk_gc_should_start(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_enabled && !gc->gc_active) { - pblk_gc_start(pblk); - pblk_gc_kick(pblk); - } -} - -void pblk_gc_should_stop(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - if (gc->gc_active && !gc->gc_forced) - gc->gc_active = 0; -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - pblk_rl_update_rates(&pblk->rl); -} - -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active) -{ - struct pblk_gc *gc = &pblk->gc; - - spin_lock(&gc->lock); - *gc_enabled = gc->gc_enabled; - *gc_active = gc->gc_active; - spin_unlock(&gc->lock); -} - -int pblk_gc_sysfs_force(struct pblk *pblk, int force) -{ - struct pblk_gc *gc = &pblk->gc; - - if (force < 0 || force > 1) - return -EINVAL; - - spin_lock(&gc->lock); - gc->gc_forced = force; - - if (force) - gc->gc_enabled = 1; - else - gc->gc_enabled = 0; - spin_unlock(&gc->lock); - - pblk_gc_should_start(pblk); - - return 0; -} - -int pblk_gc_init(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - int ret; - - gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts"); - if (IS_ERR(gc->gc_ts)) { - pblk_err(pblk, "could not allocate GC main kthread\n"); - return PTR_ERR(gc->gc_ts); - } - - gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk, - "pblk-gc-writer-ts"); - if (IS_ERR(gc->gc_writer_ts)) { - pblk_err(pblk, "could not allocate GC writer kthread\n"); - ret = PTR_ERR(gc->gc_writer_ts); - goto fail_free_main_kthread; - } - - gc->gc_reader_ts = kthread_create(pblk_gc_reader_ts, pblk, - "pblk-gc-reader-ts"); - if (IS_ERR(gc->gc_reader_ts)) { - pblk_err(pblk, "could not allocate GC reader kthread\n"); - ret = PTR_ERR(gc->gc_reader_ts); - goto fail_free_writer_kthread; - } - - timer_setup(&gc->gc_timer, pblk_gc_timer, 0); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - - gc->gc_active = 0; - gc->gc_forced = 0; - gc->gc_enabled = 1; - gc->w_entries = 0; - atomic_set(&gc->read_inflight_gc, 0); - atomic_set(&gc->pipeline_gc, 0); - - /* Workqueue that reads valid sectors from a line and submit them to the - * GC writer to be recycled. - */ - gc->gc_line_reader_wq = alloc_workqueue("pblk-gc-line-reader-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_GC_MAX_READERS); - if (!gc->gc_line_reader_wq) { - pblk_err(pblk, "could not allocate GC line reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_kthread; - } - - /* Workqueue that prepare lines for GC */ - gc->gc_reader_wq = alloc_workqueue("pblk-gc-line_wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 1); - if (!gc->gc_reader_wq) { - pblk_err(pblk, "could not allocate GC reader workqueue\n"); - ret = -ENOMEM; - goto fail_free_reader_line_wq; - } - - spin_lock_init(&gc->lock); - spin_lock_init(&gc->w_lock); - spin_lock_init(&gc->r_lock); - - sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); - - INIT_LIST_HEAD(&gc->w_list); - INIT_LIST_HEAD(&gc->r_list); - - return 0; - -fail_free_reader_line_wq: - destroy_workqueue(gc->gc_line_reader_wq); -fail_free_reader_kthread: - kthread_stop(gc->gc_reader_ts); -fail_free_writer_kthread: - kthread_stop(gc->gc_writer_ts); -fail_free_main_kthread: - kthread_stop(gc->gc_ts); - - return ret; -} - -void pblk_gc_exit(struct pblk *pblk, bool graceful) -{ - struct pblk_gc *gc = &pblk->gc; - - gc->gc_enabled = 0; - del_timer_sync(&gc->gc_timer); - gc->gc_active = 0; - - if (gc->gc_ts) - kthread_stop(gc->gc_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); - - if (graceful) { - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - } - - destroy_workqueue(gc->gc_reader_wq); - destroy_workqueue(gc->gc_line_reader_wq); - - if (gc->gc_writer_ts) - kthread_stop(gc->gc_writer_ts); -} diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c deleted file mode 100644 index 5924f09c217b..000000000000 --- a/drivers/lightnvm/pblk-init.c +++ /dev/null @@ -1,1324 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.c) - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-init.c - pblk's initialization. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned int write_buffer_size; - -module_param(write_buffer_size, uint, 0644); -MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer"); - -struct pblk_global_caches { - struct kmem_cache *ws; - struct kmem_cache *rec; - struct kmem_cache *g_rq; - struct kmem_cache *w_rq; - - struct kref kref; - - struct mutex mutex; /* Ensures consistency between - * caches and kref - */ -}; - -static struct pblk_global_caches pblk_caches = { - .mutex = __MUTEX_INITIALIZER(pblk_caches.mutex), - .kref = KREF_INIT(0), -}; - -struct bio_set pblk_bio_set; - -static blk_qc_t pblk_submit_bio(struct bio *bio) -{ - struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; - - if (bio_op(bio) == REQ_OP_DISCARD) { - pblk_discard(pblk, bio); - if (!(bio->bi_opf & REQ_PREFLUSH)) { - bio_endio(bio); - return BLK_QC_T_NONE; - } - } - - /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap - * constraint. Writes can be of arbitrary size. - */ - if (bio_data_dir(bio) == READ) { - blk_queue_split(&bio); - pblk_submit_read(pblk, bio); - } else { - /* Prevent deadlock in the case of a modest LUN configuration - * and large user I/Os. Unless stalled, the rate limiter - * leaves at least 256KB available for user I/O. - */ - if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) - blk_queue_split(&bio); - - pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); - } - - return BLK_QC_T_NONE; -} - -static const struct block_device_operations pblk_bops = { - .owner = THIS_MODULE, - .submit_bio = pblk_submit_bio, -}; - - -static size_t pblk_trans_map_size(struct pblk *pblk) -{ - int entry_size = 8; - - if (pblk->addrf_len < 32) - entry_size = 4; - - return entry_size * pblk->capacity; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static u32 pblk_l2p_crc(struct pblk *pblk) -{ - size_t map_size; - u32 crc = ~(u32)0; - - map_size = pblk_trans_map_size(pblk); - crc = crc32_le(crc, pblk->trans_map, map_size); - return crc; -} -#endif - -static void pblk_l2p_free(struct pblk *pblk) -{ - vfree(pblk->trans_map); -} - -static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) -{ - struct pblk_line *line = NULL; - - if (factory_init) { - guid_gen(&pblk->instance_uuid); - } else { - line = pblk_recov_l2p(pblk); - if (IS_ERR(line)) { - pblk_err(pblk, "could not recover l2p table\n"); - return -EFAULT; - } - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - /* Free full lines directly as GC has not been started yet */ - pblk_gc_free_full_lines(pblk); - - if (!line) { - /* Configure next line for user data */ - line = pblk_line_get_first_data(pblk); - if (!line) - return -EFAULT; - } - - return 0; -} - -static int pblk_l2p_init(struct pblk *pblk, bool factory_init) -{ - sector_t i; - struct ppa_addr ppa; - size_t map_size; - int ret = 0; - - map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | - __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); - if (!pblk->trans_map) { - pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", - map_size); - return -ENOMEM; - } - - pblk_ppa_set_empty(&ppa); - - for (i = 0; i < pblk->capacity; i++) - pblk_trans_map_set(pblk, i, ppa); - - ret = pblk_l2p_recover(pblk, factory_init); - if (ret) - vfree(pblk->trans_map); - - return ret; -} - -static void pblk_rwb_free(struct pblk *pblk) -{ - if (pblk_rb_tear_down_check(&pblk->rwb)) - pblk_err(pblk, "write buffer error on tear down\n"); - - pblk_rb_free(&pblk->rwb); -} - -static int pblk_rwb_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - unsigned long buffer_size; - int pgs_in_buffer, threshold; - - threshold = geo->mw_cunits * geo->all_luns; - pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt) - * geo->all_luns; - - if (write_buffer_size && (write_buffer_size > pgs_in_buffer)) - buffer_size = write_buffer_size; - else - buffer_size = pgs_in_buffer; - - return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs); -} - -static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo, - struct nvm_addrf_12 *dst) -{ - struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf; - int power_len; - - /* Re-calculate channel and lun format to adapt to configuration */ - power_len = get_count_order(geo->num_ch); - if (1 << power_len != geo->num_ch) { - pblk_err(pblk, "supports only power-of-two channel config.\n"); - return -EINVAL; - } - dst->ch_len = power_len; - - power_len = get_count_order(geo->num_lun); - if (1 << power_len != geo->num_lun) { - pblk_err(pblk, "supports only power-of-two LUN config.\n"); - return -EINVAL; - } - dst->lun_len = power_len; - - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->sec_offset = 0; - dst->pln_offset = dst->sec_len; - dst->ch_offset = dst->pln_offset + dst->pln_len; - dst->lun_offset = dst->ch_offset + dst->ch_len; - dst->pg_offset = dst->lun_offset + dst->lun_len; - dst->blk_offset = dst->pg_offset + dst->pg_len; - - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - - return dst->blk_offset + src->blk_len; -} - -static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst, - struct pblk_addrf *udst) -{ - struct nvm_addrf *src = &geo->addrf; - - adst->ch_len = get_count_order(geo->num_ch); - adst->lun_len = get_count_order(geo->num_lun); - adst->chk_len = src->chk_len; - adst->sec_len = src->sec_len; - - adst->sec_offset = 0; - adst->ch_offset = adst->sec_len; - adst->lun_offset = adst->ch_offset + adst->ch_len; - adst->chk_offset = adst->lun_offset + adst->lun_len; - - adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset; - adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset; - adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset; - adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset; - - udst->sec_stripe = geo->ws_opt; - udst->ch_stripe = geo->num_ch; - udst->lun_stripe = geo->num_lun; - - udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe; - udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe; - - return adst->chk_offset + adst->chk_len; -} - -static int pblk_set_addrf(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int mod; - - switch (geo->version) { - case NVM_OCSSD_SPEC_12: - div_u64_rem(geo->clba, pblk->min_write_pgs, &mod); - if (mod) { - pblk_err(pblk, "bad configuration of sectors/pages\n"); - return -EINVAL; - } - - pblk->addrf_len = pblk_set_addrf_12(pblk, geo, - (void *)&pblk->addrf); - break; - case NVM_OCSSD_SPEC_20: - pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf, - &pblk->uaddrf); - break; - default: - pblk_err(pblk, "OCSSD revision not supported (%d)\n", - geo->version); - return -EINVAL; - } - - return 0; -} - -static int pblk_create_global_caches(void) -{ - - pblk_caches.ws = kmem_cache_create("pblk_blk_ws", - sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_caches.ws) - return -ENOMEM; - - pblk_caches.rec = kmem_cache_create("pblk_rec", - sizeof(struct pblk_rec_ctx), 0, 0, NULL); - if (!pblk_caches.rec) - goto fail_destroy_ws; - - pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, - 0, 0, NULL); - if (!pblk_caches.g_rq) - goto fail_destroy_rec; - - pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, - 0, 0, NULL); - if (!pblk_caches.w_rq) - goto fail_destroy_g_rq; - - return 0; - -fail_destroy_g_rq: - kmem_cache_destroy(pblk_caches.g_rq); -fail_destroy_rec: - kmem_cache_destroy(pblk_caches.rec); -fail_destroy_ws: - kmem_cache_destroy(pblk_caches.ws); - - return -ENOMEM; -} - -static int pblk_get_global_caches(void) -{ - int ret = 0; - - mutex_lock(&pblk_caches.mutex); - - if (kref_get_unless_zero(&pblk_caches.kref)) - goto out; - - ret = pblk_create_global_caches(); - if (!ret) - kref_init(&pblk_caches.kref); - -out: - mutex_unlock(&pblk_caches.mutex); - return ret; -} - -static void pblk_destroy_global_caches(struct kref *ref) -{ - struct pblk_global_caches *c; - - c = container_of(ref, struct pblk_global_caches, kref); - - kmem_cache_destroy(c->ws); - kmem_cache_destroy(c->rec); - kmem_cache_destroy(c->g_rq); - kmem_cache_destroy(c->w_rq); -} - -static void pblk_put_global_caches(void) -{ - mutex_lock(&pblk_caches.mutex); - kref_put(&pblk_caches.kref, pblk_destroy_global_caches); - mutex_unlock(&pblk_caches.mutex); -} - -static int pblk_core_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int ret, max_write_ppas; - - atomic64_set(&pblk->user_wa, 0); - atomic64_set(&pblk->pad_wa, 0); - atomic64_set(&pblk->gc_wa, 0); - pblk->user_rst_wa = 0; - pblk->pad_rst_wa = 0; - pblk->gc_rst_wa = 0; - - atomic64_set(&pblk->nr_flush, 0); - pblk->nr_flush_rst = 0; - - pblk->min_write_pgs = geo->ws_opt; - pblk->min_write_pgs_data = pblk->min_write_pgs; - max_write_ppas = pblk->min_write_pgs * geo->all_luns; - pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); - pblk->max_write_pgs = min_t(int, pblk->max_write_pgs, - queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT)); - pblk_set_sec_per_write(pblk, pblk->min_write_pgs); - - pblk->oob_meta_size = geo->sos; - if (!pblk_is_oob_meta_supported(pblk)) { - /* For drives which does not have OOB metadata feature - * in order to support recovery feature we need to use - * so called packed metadata. Packed metada will store - * the same information as OOB metadata (l2p table mapping, - * but in the form of the single page at the end of - * every write request. - */ - if (pblk->min_write_pgs - * sizeof(struct pblk_sec_meta) > PAGE_SIZE) { - /* We want to keep all the packed metadata on single - * page per write requests. So we need to ensure that - * it will fit. - * - * This is more like sanity check, since there is - * no device with such a big minimal write size - * (above 1 metabytes). - */ - pblk_err(pblk, "Not supported min write size\n"); - return -EINVAL; - } - /* For packed meta approach we do some simplification. - * On read path we always issue requests which size - * equal to max_write_pgs, with all pages filled with - * user payload except of last one page which will be - * filled with packed metadata. - */ - pblk->max_write_pgs = pblk->min_write_pgs; - pblk->min_write_pgs_data = pblk->min_write_pgs - 1; - } - - pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t), - GFP_KERNEL); - if (!pblk->pad_dist) - return -ENOMEM; - - if (pblk_get_global_caches()) - goto fail_free_pad_dist; - - /* Internal bios can be at most the sectors signaled by the device. */ - ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0); - if (ret) - goto free_global_caches; - - ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE, - pblk_caches.ws); - if (ret) - goto free_page_bio_pool; - - ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns, - pblk_caches.rec); - if (ret) - goto free_gen_ws_pool; - - ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_rec_pool; - - ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns, - pblk_caches.g_rq); - if (ret) - goto free_r_rq_pool; - - ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns, - pblk_caches.w_rq); - if (ret) - goto free_e_rq_pool; - - pblk->close_wq = alloc_workqueue("pblk-close-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); - if (!pblk->close_wq) - goto free_w_rq_pool; - - pblk->bb_wq = alloc_workqueue("pblk-bb-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->bb_wq) - goto free_close_wq; - - pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", - WQ_MEM_RECLAIM | WQ_UNBOUND, 0); - if (!pblk->r_end_wq) - goto free_bb_wq; - - if (pblk_set_addrf(pblk)) - goto free_r_end_wq; - - INIT_LIST_HEAD(&pblk->compl_list); - INIT_LIST_HEAD(&pblk->resubmit_list); - - return 0; - -free_r_end_wq: - destroy_workqueue(pblk->r_end_wq); -free_bb_wq: - destroy_workqueue(pblk->bb_wq); -free_close_wq: - destroy_workqueue(pblk->close_wq); -free_w_rq_pool: - mempool_exit(&pblk->w_rq_pool); -free_e_rq_pool: - mempool_exit(&pblk->e_rq_pool); -free_r_rq_pool: - mempool_exit(&pblk->r_rq_pool); -free_rec_pool: - mempool_exit(&pblk->rec_pool); -free_gen_ws_pool: - mempool_exit(&pblk->gen_ws_pool); -free_page_bio_pool: - mempool_exit(&pblk->page_bio_pool); -free_global_caches: - pblk_put_global_caches(); -fail_free_pad_dist: - kfree(pblk->pad_dist); - return -ENOMEM; -} - -static void pblk_core_free(struct pblk *pblk) -{ - if (pblk->close_wq) - destroy_workqueue(pblk->close_wq); - - if (pblk->r_end_wq) - destroy_workqueue(pblk->r_end_wq); - - if (pblk->bb_wq) - destroy_workqueue(pblk->bb_wq); - - mempool_exit(&pblk->page_bio_pool); - mempool_exit(&pblk->gen_ws_pool); - mempool_exit(&pblk->rec_pool); - mempool_exit(&pblk->r_rq_pool); - mempool_exit(&pblk->e_rq_pool); - mempool_exit(&pblk->w_rq_pool); - - pblk_put_global_caches(); - kfree(pblk->pad_dist); -} - -static void pblk_line_mg_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - kfree(l_mg->bb_template); - kfree(l_mg->bb_aux); - kfree(l_mg->vsc_list); - - for (i = 0; i < PBLK_DATA_LINES; i++) { - kfree(l_mg->sline_meta[i]); - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); - kmem_cache_destroy(l_mg->bitmap_cache); -} - -static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg, - struct pblk_line *line) -{ - struct pblk_w_err_gc *w_err_gc = line->w_err_gc; - - kfree(line->blk_bitmap); - kfree(line->erase_bitmap); - kfree(line->chks); - - kvfree(w_err_gc->lba_list); - kfree(w_err_gc); -} - -static void pblk_lines_free(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - int i; - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - pblk_line_free(line); - pblk_line_meta_free(l_mg, line); - } - - pblk_line_mg_free(pblk); - - kfree(pblk->luns); - kfree(pblk->lines); -} - -static int pblk_luns_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - int i; - - /* TODO: Implement unbalanced LUN support */ - if (geo->num_lun < 0) { - pblk_err(pblk, "unbalanced LUN config.\n"); - return -EINVAL; - } - - pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun), - GFP_KERNEL); - if (!pblk->luns) - return -ENOMEM; - - for (i = 0; i < geo->all_luns; i++) { - /* Stripe across channels */ - int ch = i % geo->num_ch; - int lun_raw = i / geo->num_ch; - int lunid = lun_raw + ch * geo->num_lun; - - rlun = &pblk->luns[i]; - rlun->bppa = dev->luns[lunid]; - - sema_init(&rlun->wr_sem, 1); - } - - return 0; -} - -/* See comment over struct line_emeta definition */ -static unsigned int calc_emeta_len(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - - /* Round to sector size so that lba_list starts on its own sector */ - lm->emeta_sec[1] = DIV_ROUND_UP( - sizeof(struct line_emeta) + lm->blk_bitmap_len + - sizeof(struct wa_counters), geo->csecs); - lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs; - - /* Round to sector size so that vsc_list starts on its own sector */ - lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0]; - lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64), - geo->csecs); - lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs; - - lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32), - geo->csecs); - lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs; - - lm->vsc_list_len = l_mg->nr_lines * sizeof(u32); - - return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]); -} - -static int pblk_set_provision(struct pblk *pblk, int nr_free_chks) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - sector_t provisioned; - int sec_meta, blk_meta, clba; - int minimum; - - if (geo->op == NVM_TARGET_DEFAULT_OP) - pblk->op = PBLK_DEFAULT_OP; - else - pblk->op = geo->op; - - minimum = pblk_get_min_chks(pblk); - provisioned = nr_free_chks; - provisioned *= (100 - pblk->op); - sector_div(provisioned, 100); - - if ((nr_free_chks - provisioned) < minimum) { - if (geo->op != NVM_TARGET_DEFAULT_OP) { - pblk_err(pblk, "OP too small to create a sane instance\n"); - return -EINTR; - } - - /* If the user did not specify an OP value, and PBLK_DEFAULT_OP - * is not enough, calculate and set sane value - */ - - provisioned = nr_free_chks - minimum; - pblk->op = (100 * minimum) / nr_free_chks; - pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n", - pblk->op); - } - - pblk->op_blks = nr_free_chks - provisioned; - - /* Internally pblk manages all free blocks, but all calculations based - * on user capacity consider only provisioned blocks - */ - pblk->rl.total_blocks = nr_free_chks; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data; - pblk->capacity = (provisioned - blk_meta) * clba; - - atomic_set(&pblk->rl.free_blocks, nr_free_chks); - atomic_set(&pblk->rl.free_user_blocks, nr_free_chks); - - return 0; -} - -static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line, - struct nvm_chk_meta *meta) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - int i, nr_bad_chks = 0; - - for (i = 0; i < lm->blk_per_line; i++) { - struct pblk_lun *rlun = &pblk->luns[i]; - struct nvm_chk_meta *chunk; - struct nvm_chk_meta *chunk_meta; - struct ppa_addr ppa; - int pos; - - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - chunk = &line->chks[pos]; - - ppa.m.chk = line->id; - chunk_meta = pblk_chunk_get_off(pblk, meta, ppa); - - chunk->state = chunk_meta->state; - chunk->type = chunk_meta->type; - chunk->wi = chunk_meta->wi; - chunk->slba = chunk_meta->slba; - chunk->cnlb = chunk_meta->cnlb; - chunk->wp = chunk_meta->wp; - - trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa, - chunk->state); - - if (chunk->type & NVM_CHK_TP_SZ_SPEC) { - WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n"); - continue; - } - - if (!(chunk->state & NVM_CHK_ST_OFFLINE)) - continue; - - set_bit(pos, line->blk_bitmap); - nr_bad_chks++; - } - - return nr_bad_chks; -} - -static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line, - void *chunk_meta, int line_id) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - long nr_bad_chks, chk_in_line; - - line->pblk = pblk; - line->id = line_id; - line->type = PBLK_LINETYPE_FREE; - line->state = PBLK_LINESTATE_NEW; - line->gc_group = PBLK_LINEGC_NONE; - line->vsc = &l_mg->vsc_list[line_id]; - spin_lock_init(&line->lock); - - nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta); - - chk_in_line = lm->blk_per_line - nr_bad_chks; - if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line || - chk_in_line < lm->min_blk_line) { - line->state = PBLK_LINESTATE_BAD; - list_add_tail(&line->list, &l_mg->bad_list); - return 0; - } - - atomic_set(&line->blk_in_line, chk_in_line); - list_add_tail(&line->list, &l_mg->free_list); - l_mg->nr_free_lines++; - - return chk_in_line; -} - -static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - - line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->blk_bitmap) - return -ENOMEM; - - line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL); - if (!line->erase_bitmap) - goto free_blk_bitmap; - - - line->chks = kmalloc_array(lm->blk_per_line, - sizeof(struct nvm_chk_meta), GFP_KERNEL); - if (!line->chks) - goto free_erase_bitmap; - - line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL); - if (!line->w_err_gc) - goto free_chks; - - return 0; - -free_chks: - kfree(line->chks); -free_erase_bitmap: - kfree(line->erase_bitmap); -free_blk_bitmap: - kfree(line->blk_bitmap); - return -ENOMEM; -} - -static int pblk_line_mg_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int i, bb_distance; - - l_mg->nr_lines = geo->num_chk; - l_mg->log_line = l_mg->data_line = NULL; - l_mg->l_seq_nr = l_mg->d_seq_nr = 0; - l_mg->nr_free_lines = 0; - bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES); - - INIT_LIST_HEAD(&l_mg->free_list); - INIT_LIST_HEAD(&l_mg->corrupt_list); - INIT_LIST_HEAD(&l_mg->bad_list); - INIT_LIST_HEAD(&l_mg->gc_full_list); - INIT_LIST_HEAD(&l_mg->gc_high_list); - INIT_LIST_HEAD(&l_mg->gc_mid_list); - INIT_LIST_HEAD(&l_mg->gc_low_list); - INIT_LIST_HEAD(&l_mg->gc_empty_list); - INIT_LIST_HEAD(&l_mg->gc_werr_list); - - INIT_LIST_HEAD(&l_mg->emeta_list); - - l_mg->gc_lists[0] = &l_mg->gc_werr_list; - l_mg->gc_lists[1] = &l_mg->gc_high_list; - l_mg->gc_lists[2] = &l_mg->gc_mid_list; - l_mg->gc_lists[3] = &l_mg->gc_low_list; - - spin_lock_init(&l_mg->free_lock); - spin_lock_init(&l_mg->close_lock); - spin_lock_init(&l_mg->gc_lock); - - l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL); - if (!l_mg->vsc_list) - goto fail; - - l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_template) - goto fail_free_vsc_list; - - l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL); - if (!l_mg->bb_aux) - goto fail_free_bb_template; - - /* smeta is always small enough to fit on a kmalloc memory allocation, - * emeta depends on the number of LUNs allocated to the pblk instance - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL); - if (!l_mg->sline_meta[i]) - goto fail_free_smeta; - } - - l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap", - lm->sec_bitmap_len, 0, 0, NULL); - if (!l_mg->bitmap_cache) - goto fail_free_smeta; - - /* the bitmap pool is used for both valid and map bitmaps */ - l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2, - l_mg->bitmap_cache); - if (!l_mg->bitmap_pool) - goto fail_destroy_bitmap_cache; - - /* emeta allocates three different buffers for managing metadata with - * in-memory and in-media layouts - */ - for (i = 0; i < PBLK_DATA_LINES; i++) { - struct pblk_emeta *emeta; - - emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL); - if (!emeta) - goto fail_free_emeta; - - emeta->buf = kvmalloc(lm->emeta_len[0], GFP_KERNEL); - if (!emeta->buf) { - kfree(emeta); - goto fail_free_emeta; - } - - emeta->nr_entries = lm->emeta_sec[0]; - l_mg->eline_meta[i] = emeta; - } - - for (i = 0; i < l_mg->nr_lines; i++) - l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY); - - bb_distance = (geo->all_luns) * geo->ws_opt; - for (i = 0; i < lm->sec_per_line; i += bb_distance) - bitmap_set(l_mg->bb_template, i, geo->ws_opt); - - return 0; - -fail_free_emeta: - while (--i >= 0) { - kvfree(l_mg->eline_meta[i]->buf); - kfree(l_mg->eline_meta[i]); - } - - mempool_destroy(l_mg->bitmap_pool); -fail_destroy_bitmap_cache: - kmem_cache_destroy(l_mg->bitmap_cache); -fail_free_smeta: - for (i = 0; i < PBLK_DATA_LINES; i++) - kfree(l_mg->sline_meta[i]); - kfree(l_mg->bb_aux); -fail_free_bb_template: - kfree(l_mg->bb_template); -fail_free_vsc_list: - kfree(l_mg->vsc_list); -fail: - return -ENOMEM; -} - -static int pblk_line_meta_init(struct pblk *pblk) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int smeta_len, emeta_len; - int i; - - lm->sec_per_line = geo->clba * geo->all_luns; - lm->blk_per_line = geo->all_luns; - lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); - lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long); - lm->mid_thrs = lm->sec_per_line / 2; - lm->high_thrs = lm->sec_per_line / 4; - lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs; - - /* Calculate necessary pages for smeta. See comment over struct - * line_smeta definition - */ - i = 1; -add_smeta_page: - lm->smeta_sec = i * geo->ws_opt; - lm->smeta_len = lm->smeta_sec * geo->csecs; - - smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len; - if (smeta_len > lm->smeta_len) { - i++; - goto add_smeta_page; - } - - /* Calculate necessary pages for emeta. See comment over struct - * line_emeta definition - */ - i = 1; -add_emeta_page: - lm->emeta_sec[0] = i * geo->ws_opt; - lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs; - - emeta_len = calc_emeta_len(pblk); - if (emeta_len > lm->emeta_len[0]) { - i++; - goto add_emeta_page; - } - - lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0; - - lm->min_blk_line = 1; - if (geo->all_luns > 1) - lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + - lm->emeta_sec[0], geo->clba); - - if (lm->min_blk_line > lm->blk_per_line) { - pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n", - lm->blk_per_line); - return -EINVAL; - } - - return 0; -} - -static int pblk_lines_init(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - void *chunk_meta; - int nr_free_chks = 0; - int i, ret; - - ret = pblk_line_meta_init(pblk); - if (ret) - return ret; - - ret = pblk_line_mg_init(pblk); - if (ret) - return ret; - - ret = pblk_luns_init(pblk); - if (ret) - goto fail_free_meta; - - chunk_meta = pblk_get_chunk_meta(pblk); - if (IS_ERR(chunk_meta)) { - ret = PTR_ERR(chunk_meta); - goto fail_free_luns; - } - - pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line), - GFP_KERNEL); - if (!pblk->lines) { - ret = -ENOMEM; - goto fail_free_chunk_meta; - } - - for (i = 0; i < l_mg->nr_lines; i++) { - line = &pblk->lines[i]; - - ret = pblk_alloc_line_meta(pblk, line); - if (ret) - goto fail_free_lines; - - nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - } - - if (!nr_free_chks) { - pblk_err(pblk, "too many bad blocks prevent for sane instance\n"); - ret = -EINTR; - goto fail_free_lines; - } - - ret = pblk_set_provision(pblk, nr_free_chks); - if (ret) - goto fail_free_lines; - - vfree(chunk_meta); - return 0; - -fail_free_lines: - while (--i >= 0) - pblk_line_meta_free(l_mg, &pblk->lines[i]); - kfree(pblk->lines); -fail_free_chunk_meta: - vfree(chunk_meta); -fail_free_luns: - kfree(pblk->luns); -fail_free_meta: - pblk_line_mg_free(pblk); - - return ret; -} - -static int pblk_writer_init(struct pblk *pblk) -{ - pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t"); - if (IS_ERR(pblk->writer_ts)) { - int err = PTR_ERR(pblk->writer_ts); - - if (err != -EINTR) - pblk_err(pblk, "could not allocate writer kthread (%d)\n", - err); - return err; - } - - timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0); - mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100)); - - return 0; -} - -static void pblk_writer_stop(struct pblk *pblk) -{ - /* The pipeline must be stopped and the write buffer emptied before the - * write thread is stopped - */ - WARN(pblk_rb_read_count(&pblk->rwb), - "Stopping not fully persisted write buffer\n"); - - WARN(pblk_rb_sync_count(&pblk->rwb), - "Stopping not fully synced write buffer\n"); - - del_timer_sync(&pblk->wtimer); - if (pblk->writer_ts) - kthread_stop(pblk->writer_ts); -} - -static void pblk_free(struct pblk *pblk) -{ - pblk_lines_free(pblk); - pblk_l2p_free(pblk); - pblk_rwb_free(pblk); - pblk_core_free(pblk); - - kfree(pblk); -} - -static void pblk_tear_down(struct pblk *pblk, bool graceful) -{ - if (graceful) - __pblk_pipeline_flush(pblk); - __pblk_pipeline_stop(pblk); - pblk_writer_stop(pblk); - pblk_rb_sync_l2p(&pblk->rwb); - pblk_rl_free(&pblk->rl); - - pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful); -} - -static void pblk_exit(void *private, bool graceful) -{ - struct pblk *pblk = private; - - pblk_gc_exit(pblk, graceful); - pblk_tear_down(pblk, graceful); - -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); -#endif - - pblk_free(pblk); -} - -static sector_t pblk_capacity(void *private) -{ - struct pblk *pblk = private; - - return pblk->capacity * NR_PHY_IN_LOG; -} - -static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, - int flags) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *bqueue = dev->q; - struct request_queue *tqueue = tdisk->queue; - struct pblk *pblk; - int ret; - - pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL); - if (!pblk) - return ERR_PTR(-ENOMEM); - - pblk->dev = dev; - pblk->disk = tdisk; - pblk->state = PBLK_STATE_RUNNING; - trace_pblk_state(pblk_disk_name(pblk), pblk->state); - pblk->gc.gc_enabled = 0; - - if (!(geo->version == NVM_OCSSD_SPEC_12 || - geo->version == NVM_OCSSD_SPEC_20)) { - pblk_err(pblk, "OCSSD version not supported (%u)\n", - geo->version); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - if (geo->ext) { - pblk_err(pblk, "extended metadata not supported\n"); - kfree(pblk); - return ERR_PTR(-EINVAL); - } - - spin_lock_init(&pblk->resubmit_lock); - spin_lock_init(&pblk->trans_lock); - spin_lock_init(&pblk->lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_set(&pblk->inflight_writes, 0); - atomic_long_set(&pblk->padded_writes, 0); - atomic_long_set(&pblk->padded_wb, 0); - atomic_long_set(&pblk->req_writes, 0); - atomic_long_set(&pblk->sub_writes, 0); - atomic_long_set(&pblk->sync_writes, 0); - atomic_long_set(&pblk->inflight_reads, 0); - atomic_long_set(&pblk->cache_reads, 0); - atomic_long_set(&pblk->sync_reads, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_writes, 0); - atomic_long_set(&pblk->recov_gc_writes, 0); - atomic_long_set(&pblk->recov_gc_reads, 0); -#endif - - atomic_long_set(&pblk->read_failed, 0); - atomic_long_set(&pblk->read_empty, 0); - atomic_long_set(&pblk->read_high_ecc, 0); - atomic_long_set(&pblk->read_failed_gc, 0); - atomic_long_set(&pblk->write_failed, 0); - atomic_long_set(&pblk->erase_failed, 0); - - ret = pblk_core_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize core\n"); - goto fail; - } - - ret = pblk_lines_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize lines\n"); - goto fail_free_core; - } - - ret = pblk_rwb_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize write buffer\n"); - goto fail_free_lines; - } - - ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY); - if (ret) { - pblk_err(pblk, "could not initialize maps\n"); - goto fail_free_rwb; - } - - ret = pblk_writer_init(pblk); - if (ret) { - if (ret != -EINTR) - pblk_err(pblk, "could not initialize write thread\n"); - goto fail_free_l2p; - } - - ret = pblk_gc_init(pblk); - if (ret) { - pblk_err(pblk, "could not initialize gc\n"); - goto fail_stop_writer; - } - - /* inherit the size from the underlying device */ - blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); - blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); - - blk_queue_write_cache(tqueue, true, false); - - tqueue->limits.discard_granularity = geo->clba * geo->csecs; - tqueue->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue); - - pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n", - geo->all_luns, pblk->l_mg.nr_lines, - (unsigned long long)pblk->capacity, - pblk->rwb.nr_entries); - - wake_up_process(pblk->writer_ts); - - /* Check if we need to start GC */ - pblk_gc_should_kick(pblk); - - return pblk; - -fail_stop_writer: - pblk_writer_stop(pblk); -fail_free_l2p: - pblk_l2p_free(pblk); -fail_free_rwb: - pblk_rwb_free(pblk); -fail_free_lines: - pblk_lines_free(pblk); -fail_free_core: - pblk_core_free(pblk); -fail: - kfree(pblk); - return ERR_PTR(ret); -} - -/* physical block device target */ -static struct nvm_tgt_type tt_pblk = { - .name = "pblk", - .version = {1, 0, 0}, - - .bops = &pblk_bops, - .capacity = pblk_capacity, - - .init = pblk_init, - .exit = pblk_exit, - - .sysfs_init = pblk_sysfs_init, - .sysfs_exit = pblk_sysfs_exit, - .owner = THIS_MODULE, -}; - -static int __init pblk_module_init(void) -{ - int ret; - - ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0); - if (ret) - return ret; - ret = nvm_register_tgt_type(&tt_pblk); - if (ret) - bioset_exit(&pblk_bio_set); - return ret; -} - -static void pblk_module_exit(void) -{ - bioset_exit(&pblk_bio_set); - nvm_unregister_tgt_type(&tt_pblk); -} - -module_init(pblk_module_init); -module_exit(pblk_module_exit); -MODULE_AUTHOR("Javier Gonzalez "); -MODULE_AUTHOR("Matias Bjorling "); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs"); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c deleted file mode 100644 index 5408e32b2f13..000000000000 --- a/drivers/lightnvm/pblk-map.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-map.c - pblk's lba-ppa mapping strategy - * - */ - -#include "pblk.h" - -static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry, - struct ppa_addr *ppa_list, - unsigned long *lun_bitmap, - void *meta_list, - unsigned int valid_secs) -{ - struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta; - struct pblk_w_ctx *w_ctx; - __le64 *lba_list; - u64 paddr; - int nr_secs = pblk->min_write_pgs; - int i; - - if (!line) - return -ENOSPC; - - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - /* If we cannot allocate a new line, make sure to store metadata - * on current line and then fail - */ - line = pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - - if (!line) { - pblk_pipeline_stop(pblk); - return -ENOSPC; - } - - } - - emeta = line->emeta; - lba_list = emeta_to_lbas(pblk, emeta->buf); - - paddr = pblk_alloc_page(pblk, line, nr_secs); - - for (i = 0; i < nr_secs; i++, paddr++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - /* ppa to be sent to the device */ - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - - /* Write context for target bio completion on write buffer. Note - * that the write buffer is protected by the sync backpointer, - * and a single writer thread have access to each specific entry - * at a time. Thus, it is safe to modify the context for the - * entry we are setting up for submission without taking any - * lock or memory barrier. - */ - if (i < valid_secs) { - kref_get(&line->ref); - atomic_inc(&line->sec_to_update); - w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i); - w_ctx->ppa = ppa_list[i]; - meta->lba = cpu_to_le64(w_ctx->lba); - lba_list[paddr] = cpu_to_le64(w_ctx->lba); - if (lba_list[paddr] != addr_empty) - line->nr_valid_lbas++; - else - atomic64_inc(&pblk->pad_wa); - } else { - lba_list[paddr] = addr_empty; - meta->lba = addr_empty; - __pblk_map_invalidate(pblk, line, paddr); - } - } - - pblk_down_rq(pblk, ppa_list[0], lun_bitmap); - return 0; -} - -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off) -{ - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i; - int ret; - - for (i = off; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - } - - return 0; -} - -/* only if erase_ppa is set, acquire erase semaphore */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - void *meta_list = pblk_get_meta_for_writes(pblk, rqd); - void *meta_buffer; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_line *e_line, *d_line; - unsigned int map_secs; - int min = pblk->min_write_pgs; - int i, erase_lun; - int ret; - - - for (i = 0; i < rqd->nr_ppas; i += min) { - map_secs = (i + min > valid_secs) ? (valid_secs % min) : min; - meta_buffer = pblk_get_meta(pblk, meta_list, i); - - ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i], - lun_bitmap, meta_buffer, map_secs); - if (ret) - return ret; - - erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - - spin_lock(&e_line->lock); - if (!test_bit(erase_lun, e_line->erase_bitmap)) { - set_bit(erase_lun, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - - *erase_ppa = ppa_list[i]; - erase_ppa->a.blk = e_line->id; - erase_ppa->a.reserved = 0; - - spin_unlock(&e_line->lock); - - /* Avoid evaluating e_line->left_eblks */ - return pblk_map_rq(pblk, rqd, sentry, lun_bitmap, - valid_secs, i + min); - } - spin_unlock(&e_line->lock); - } - - d_line = pblk_line_get_data(pblk); - - /* line can change after page map. We might also be writing the - * last line. - */ - e_line = pblk_line_get_erase(pblk); - if (!e_line) - return -ENOSPC; - - /* Erase blocks that are bad in this line but might not be in next */ - if (unlikely(pblk_ppa_empty(*erase_ppa)) && - bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) { - int bit = -1; - -retry: - bit = find_next_bit(d_line->blk_bitmap, - lm->blk_per_line, bit + 1); - if (bit >= lm->blk_per_line) - return 0; - - spin_lock(&e_line->lock); - if (test_bit(bit, e_line->erase_bitmap)) { - spin_unlock(&e_line->lock); - goto retry; - } - spin_unlock(&e_line->lock); - - set_bit(bit, e_line->erase_bitmap); - atomic_dec(&e_line->left_eblks); - *erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */ - erase_ppa->a.blk = e_line->id; - } - - return 0; -} diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c deleted file mode 100644 index 5abb1705b039..000000000000 --- a/drivers/lightnvm/pblk-rb.c +++ /dev/null @@ -1,858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * - * Based upon the circular ringbuffer. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-rb.c - pblk's write buffer - */ - -#include - -#include "pblk.h" - -static DECLARE_RWSEM(pblk_rb_lock); - -static void pblk_rb_data_free(struct pblk_rb *rb) -{ - struct pblk_rb_pages *p, *t; - - down_write(&pblk_rb_lock); - list_for_each_entry_safe(p, t, &rb->pages, list) { - free_pages((unsigned long)page_address(p->pages), p->order); - list_del(&p->list); - kfree(p); - } - up_write(&pblk_rb_lock); -} - -void pblk_rb_free(struct pblk_rb *rb) -{ - pblk_rb_data_free(rb); - vfree(rb->entries); -} - -/* - * pblk_rb_calculate_size -- calculate the size of the write buffer - */ -static unsigned int pblk_rb_calculate_size(unsigned int nr_entries, - unsigned int threshold) -{ - unsigned int thr_sz = 1 << (get_count_order(threshold + NVM_MAX_VLBA)); - unsigned int max_sz = max(thr_sz, nr_entries); - unsigned int max_io; - - /* Alloc a write buffer that can (i) fit at least two split bios - * (considering max I/O size NVM_MAX_VLBA, and (ii) guarantee that the - * threshold will be respected - */ - max_io = (1 << max((int)(get_count_order(max_sz)), - (int)(get_count_order(NVM_MAX_VLBA << 1)))); - if ((threshold + NVM_MAX_VLBA) >= max_io) - max_io <<= 1; - - return max_io; -} - -/* - * Initialize ring buffer. The data and metadata buffers must be previously - * allocated and their size must be a power of two - * (Documentation/core-api/circular-buffers.rst) - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_size) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entries; - unsigned int init_entry = 0; - unsigned int max_order = MAX_ORDER - 1; - unsigned int power_size, power_seg_sz; - unsigned int alloc_order, order, iter; - unsigned int nr_entries; - - nr_entries = pblk_rb_calculate_size(size, threshold); - entries = vzalloc(array_size(nr_entries, sizeof(struct pblk_rb_entry))); - if (!entries) - return -ENOMEM; - - power_size = get_count_order(nr_entries); - power_seg_sz = get_count_order(seg_size); - - down_write(&pblk_rb_lock); - rb->entries = entries; - rb->seg_size = (1 << power_seg_sz); - rb->nr_entries = (1 << power_size); - rb->mem = rb->subm = rb->sync = rb->l2p_update = 0; - rb->back_thres = threshold; - rb->flush_point = EMPTY_ENTRY; - - spin_lock_init(&rb->w_lock); - spin_lock_init(&rb->s_lock); - - INIT_LIST_HEAD(&rb->pages); - - alloc_order = power_size; - if (alloc_order >= max_order) { - order = max_order; - iter = (1 << (alloc_order - max_order)); - } else { - order = alloc_order; - iter = 1; - } - - do { - struct pblk_rb_entry *entry; - struct pblk_rb_pages *page_set; - void *kaddr; - unsigned long set_size; - int i; - - page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL); - if (!page_set) { - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - - page_set->order = order; - page_set->pages = alloc_pages(GFP_KERNEL, order); - if (!page_set->pages) { - kfree(page_set); - pblk_rb_data_free(rb); - up_write(&pblk_rb_lock); - vfree(entries); - return -ENOMEM; - } - kaddr = page_address(page_set->pages); - - entry = &rb->entries[init_entry]; - entry->data = kaddr; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - - set_size = (1 << order); - for (i = 1; i < set_size; i++) { - entry = &rb->entries[init_entry]; - entry->cacheline = pblk_cacheline_to_addr(init_entry++); - entry->data = kaddr + (i * rb->seg_size); - entry->w_ctx.flags = PBLK_WRITABLE_ENTRY; - bio_list_init(&entry->w_ctx.bios); - } - - list_add_tail(&page_set->list, &rb->pages); - iter--; - } while (iter > 0); - up_write(&pblk_rb_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_set(&rb->inflight_flush_point, 0); -#endif - - /* - * Initialize rate-limiter, which controls access to the write buffer - * by user and GC I/O - */ - pblk_rl_init(&pblk->rl, rb->nr_entries, threshold); - - return 0; -} - -static void clean_wctx(struct pblk_w_ctx *w_ctx) -{ - int flags; - - flags = READ_ONCE(w_ctx->flags); - WARN_ONCE(!(flags & PBLK_SUBMITTED_ENTRY), - "pblk: overwriting unsubmitted data\n"); - - /* Release flags on context. Protect from writes and reads */ - smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY); - pblk_ppa_set_empty(&w_ctx->ppa); - w_ctx->lba = ADDR_EMPTY; -} - -#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size) -#define pblk_rb_ring_space(rb, head, tail, size) \ - (CIRC_SPACE(head, tail, size)) - -/* - * Buffer space is calculated with respect to the back pointer signaling - * synchronized entries to the media. - */ -static unsigned int pblk_rb_space(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries) -{ - return (p + nr_entries) & (rb->nr_entries - 1); -} - -/* - * Buffer count is calculated with respect to the submission entry signaling the - * entries that are available to send to the media - */ -unsigned int pblk_rb_read_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int subm = READ_ONCE(rb->subm); - - return pblk_rb_ring_count(mem, subm, rb->nr_entries); -} - -unsigned int pblk_rb_sync_count(struct pblk_rb *rb) -{ - unsigned int mem = READ_ONCE(rb->mem); - unsigned int sync = READ_ONCE(rb->sync); - - return pblk_rb_ring_count(mem, sync, rb->nr_entries); -} - -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int subm; - - subm = READ_ONCE(rb->subm); - /* Commit read means updating submission pointer */ - smp_store_release(&rb->subm, pblk_rb_ptr_wrap(rb, subm, nr_entries)); - - return subm; -} - -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_line *line; - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - unsigned int user_io = 0, gc_io = 0; - unsigned int i; - int flags; - - for (i = 0; i < to_update; i++) { - entry = &rb->entries[rb->l2p_update]; - w_ctx = &entry->w_ctx; - - flags = READ_ONCE(entry->w_ctx.flags); - if (flags & PBLK_IOTYPE_USER) - user_io++; - else if (flags & PBLK_IOTYPE_GC) - gc_io++; - else - WARN(1, "pblk: unknown IO type\n"); - - pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa, - entry->cacheline); - - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - clean_wctx(w_ctx); - rb->l2p_update = pblk_rb_ptr_wrap(rb, rb->l2p_update, 1); - } - - pblk_rl_out(&pblk->rl, user_io, gc_io); - - return 0; -} - -/* - * When we move the l2p_update pointer, we update the l2p table - lookups will - * point to the physical address instead of to the cacheline in the write buffer - * from this moment on. - */ -static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int mem, unsigned int sync) -{ - unsigned int space, count; - int ret = 0; - - lockdep_assert_held(&rb->w_lock); - - /* Update l2p only as buffer entries are being overwritten */ - space = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries); - if (space > nr_entries) - goto out; - - count = nr_entries - space; - /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, count); - -out: - return ret; -} - -/* - * Update the l2p entry for all sectors stored on the write buffer. This means - * that all future lookups to the l2p table will point to a device address, not - * to the cacheline in the write buffer. - */ -void pblk_rb_sync_l2p(struct pblk_rb *rb) -{ - unsigned int sync; - unsigned int to_update; - - spin_lock(&rb->w_lock); - - /* Protect from reads and writes */ - sync = smp_load_acquire(&rb->sync); - - to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, to_update); - - spin_unlock(&rb->w_lock); -} - -/* - * Write @nr_entries to ring buffer from @data buffer if there is enough space. - * Typically, 4KB data chunks coming from a bio will be copied to the ring - * buffer, thus the write will fail if not all incoming data can be copied. - * - */ -static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, - struct pblk_rb_entry *entry) -{ - memcpy(entry->data, data, rb->seg_size); - - entry->w_ctx.lba = w_ctx.lba; - entry->w_ctx.ppa = w_ctx.ppa; -} - -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline); - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int ring_pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - int flags; - - entry = &rb->entries[ring_pos]; - flags = READ_ONCE(entry->w_ctx.flags); -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must guarantee that the entry is free */ - BUG_ON(!(flags & PBLK_WRITABLE_ENTRY)); -#endif - - __pblk_rb_write_entry(rb, data, w_ctx, entry); - - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) - entry->w_ctx.lba = ADDR_EMPTY; - - flags = w_ctx.flags | PBLK_WRITTEN_DATA; - - /* Release flags on write context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); -} - -static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio, - unsigned int pos) -{ - struct pblk_rb_entry *entry; - unsigned int sync, flush_point; - - pblk_rb_sync_init(rb, NULL); - sync = READ_ONCE(rb->sync); - - if (pos == sync) { - pblk_rb_sync_end(rb, NULL); - return 0; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_inc(&rb->inflight_flush_point); -#endif - - flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); - entry = &rb->entries[flush_point]; - - /* Protect flush points */ - smp_store_release(&rb->flush_point, flush_point); - - if (bio) - bio_list_add(&entry->w_ctx.bios, bio); - - pblk_rb_sync_end(rb, NULL); - - return bio ? 1 : 0; -} - -static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - unsigned int mem; - unsigned int sync; - unsigned int threshold; - - sync = READ_ONCE(rb->sync); - mem = READ_ONCE(rb->mem); - - threshold = nr_entries + rb->back_thres; - - if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < threshold) - return 0; - - if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) - return 0; - - *pos = mem; - - return 1; -} - -static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - /* Protect from read count */ - smp_store_release(&rb->mem, pblk_rb_ptr_wrap(rb, *pos, nr_entries)); - return 1; -} - -void pblk_rb_flush(struct pblk_rb *rb) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - unsigned int mem = READ_ONCE(rb->mem); - - if (pblk_rb_flush_point_set(rb, NULL, mem)) - return; - - pblk_write_kick(pblk); -} - -static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos, struct bio *bio, - int *io_ret) -{ - unsigned int mem; - - if (!__pblk_rb_may_write(rb, nr_entries, pos)) - return 0; - - mem = pblk_rb_ptr_wrap(rb, *pos, nr_entries); - *io_ret = NVM_IO_DONE; - - if (bio->bi_opf & REQ_PREFLUSH) { - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - atomic64_inc(&pblk->nr_flush); - if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem)) - *io_ret = NVM_IO_OK; - } - - /* Protect from read count */ - smp_store_release(&rb->mem, mem); - - return 1; -} - -/* - * Atomically check that (i) there is space on the write buffer for the - * incoming I/O, and (ii) the current I/O type has enough budget in the write - * buffer (rate-limiter). - */ -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - int io_ret; - - spin_lock(&rb->w_lock); - io_ret = pblk_rl_user_may_insert(&pblk->rl, nr_entries); - if (io_ret) { - spin_unlock(&rb->w_lock); - return io_ret; - } - - if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &io_ret)) { - spin_unlock(&rb->w_lock); - return NVM_IO_REQUEUE; - } - - pblk_rl_user_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return io_ret; -} - -/* - * Look at pblk_rb_may_write_user comment - */ -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - - spin_lock(&rb->w_lock); - if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries)) { - spin_unlock(&rb->w_lock); - return 0; - } - - if (!pblk_rb_may_write(rb, nr_entries, pos)) { - spin_unlock(&rb->w_lock); - return 0; - } - - pblk_rl_gc_in(&pblk->rl, nr_entries); - spin_unlock(&rb->w_lock); - - return 1; -} - -/* - * Read available entries on rb and add them to the given bio. To avoid a memory - * copy, a page reference to the write buffer is used to be added to the bio. - * - * This function is used by the write thread to form the write bio that will - * persist data on the write buffer to the media. - */ -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct request_queue *q = pblk->dev->q; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - struct pblk_rb_entry *entry; - struct page *page; - unsigned int pad = 0, to_read = nr_entries; - unsigned int i; - int flags; - - if (count < nr_entries) { - pad = nr_entries - count; - to_read = count; - } - - /* Add space for packed metadata if in use*/ - pad += (pblk->min_write_pgs - pblk->min_write_pgs_data); - - c_ctx->sentry = pos; - c_ctx->nr_valid = to_read; - c_ctx->nr_padded = pad; - - for (i = 0; i < to_read; i++) { - entry = &rb->entries[pos]; - - /* A write has been allowed into the buffer, but data is still - * being copied to it. It is ok to busy wait. - */ -try: - flags = READ_ONCE(entry->w_ctx.flags); - if (!(flags & PBLK_WRITTEN_DATA)) { - io_schedule(); - goto try; - } - - page = virt_to_page(entry->data); - if (!page) { - pblk_err(pblk, "could not allocate write bio page\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - if (bio_add_pc_page(q, bio, page, rb->seg_size, 0) != - rb->seg_size) { - pblk_err(pblk, "could not add page to write bio\n"); - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - return NVM_IO_ERR; - } - - flags &= ~PBLK_WRITTEN_DATA; - flags |= PBLK_SUBMITTED_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - - pos = pblk_rb_ptr_wrap(rb, pos, 1); - } - - if (pad) { - if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, pad)) { - pblk_err(pblk, "could not pad page in write bio\n"); - return NVM_IO_ERR; - } - - if (pad < pblk->min_write_pgs) - atomic64_inc(&pblk->pad_dist[pad - 1]); - else - pblk_warn(pblk, "padding more than min. sectors\n"); - - atomic64_add(pad, &pblk->pad_wa); - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(pad, &pblk->padded_writes); -#endif - - return NVM_IO_OK; -} - -/* - * Copy to bio only if the lba matches the one on the given cache entry. - * Otherwise, it means that the entry has been overwritten, and the bio should - * be directed to disk. - */ -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_rb_entry *entry; - struct pblk_w_ctx *w_ctx; - struct ppa_addr l2p_ppa; - u64 pos = pblk_addr_to_cacheline(ppa); - void *data; - int flags; - int ret = 1; - - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Caller must ensure that the access will not cause an overflow */ - BUG_ON(pos >= rb->nr_entries); -#endif - entry = &rb->entries[pos]; - w_ctx = &entry->w_ctx; - flags = READ_ONCE(w_ctx->flags); - - spin_lock(&rb->w_lock); - spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - /* Check if the entry has been overwritten or is scheduled to be */ - if (!pblk_ppa_comp(l2p_ppa, ppa) || w_ctx->lba != lba || - flags & PBLK_WRITABLE_ENTRY) { - ret = 0; - goto out; - } - data = bio_data(bio); - memcpy(data, entry->data, rb->seg_size); - -out: - spin_unlock(&rb->w_lock); - return ret; -} - -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos) -{ - unsigned int entry = pblk_rb_ptr_wrap(rb, pos, 0); - - return &rb->entries[entry].w_ctx; -} - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags) - __acquires(&rb->s_lock) -{ - if (flags) - spin_lock_irqsave(&rb->s_lock, *flags); - else - spin_lock_irq(&rb->s_lock); - - return rb->sync; -} - -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags) - __releases(&rb->s_lock) -{ - lockdep_assert_held(&rb->s_lock); - - if (flags) - spin_unlock_irqrestore(&rb->s_lock, *flags); - else - spin_unlock_irq(&rb->s_lock); -} - -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries) -{ - unsigned int sync, flush_point; - lockdep_assert_held(&rb->s_lock); - - sync = READ_ONCE(rb->sync); - flush_point = READ_ONCE(rb->flush_point); - - if (flush_point != EMPTY_ENTRY) { - unsigned int secs_to_flush; - - secs_to_flush = pblk_rb_ring_count(flush_point, sync, - rb->nr_entries); - if (secs_to_flush < nr_entries) { - /* Protect flush points */ - smp_store_release(&rb->flush_point, EMPTY_ENTRY); - } - } - - sync = pblk_rb_ptr_wrap(rb, sync, nr_entries); - - /* Protect from counts */ - smp_store_release(&rb->sync, sync); - - return sync; -} - -/* Calculate how many sectors to submit up to the current flush point. */ -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb) -{ - unsigned int subm, sync, flush_point; - unsigned int submitted, to_flush; - - /* Protect flush points */ - flush_point = smp_load_acquire(&rb->flush_point); - if (flush_point == EMPTY_ENTRY) - return 0; - - /* Protect syncs */ - sync = smp_load_acquire(&rb->sync); - - subm = READ_ONCE(rb->subm); - submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries); - - /* The sync point itself counts as a sector to sync */ - to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1; - - return (submitted < to_flush) ? (to_flush - submitted) : 0; -} - -int pblk_rb_tear_down_check(struct pblk_rb *rb) -{ - struct pblk_rb_entry *entry; - int i; - int ret = 0; - - spin_lock(&rb->w_lock); - spin_lock_irq(&rb->s_lock); - - if ((rb->mem == rb->subm) && (rb->subm == rb->sync) && - (rb->sync == rb->l2p_update) && - (rb->flush_point == EMPTY_ENTRY)) { - goto out; - } - - if (!rb->entries) { - ret = 1; - goto out; - } - - for (i = 0; i < rb->nr_entries; i++) { - entry = &rb->entries[i]; - - if (!entry->data) { - ret = 1; - goto out; - } - } - -out: - spin_unlock_irq(&rb->s_lock); - spin_unlock(&rb->w_lock); - - return ret; -} - -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos) -{ - return (pos & (rb->nr_entries - 1)); -} - -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos) -{ - return (pos >= rb->nr_entries); -} - -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf) -{ - struct pblk *pblk = container_of(rb, struct pblk, rwb); - struct pblk_c_ctx *c; - ssize_t offset; - int queued_entries = 0; - - spin_lock_irq(&rb->s_lock); - list_for_each_entry(c, &pblk->compl_list, list) - queued_entries++; - spin_unlock_irq(&rb->s_lock); - - if (rb->flush_point != EMPTY_ENTRY) - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - rb->flush_point, - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - else - offset = scnprintf(buf, PAGE_SIZE, - "%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n", - rb->nr_entries, - rb->mem, - rb->subm, - rb->sync, - rb->l2p_update, -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_read(&rb->inflight_flush_point), -#else - 0, -#endif - pblk_rb_read_count(rb), - pblk_rb_space(rb), - pblk_rb_flush_point_count(rb), - queued_entries); - - return offset; -} diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c deleted file mode 100644 index c28537a489bc..000000000000 --- a/drivers/lightnvm/pblk-read.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-read.c - pblk's read path - */ - -#include "pblk.h" - -/* - * There is no guarantee that the value read from cache has not been updated and - * resides at another location in the cache. We guarantee though that if the - * value is read from the cache, it belongs to the mapped lba. In order to - * guarantee and order between writes and reads are ordered, a flush must be - * issued. - */ -static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, - sector_t lba, struct ppa_addr ppa) -{ -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(pblk_ppa_empty(ppa)); - BUG_ON(!pblk_addr_in_cache(ppa)); -#endif - - return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, ppa); -} - -static int pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct bio *bio, sector_t blba, - bool *from_cache) -{ - void *meta_list = rqd->meta_list; - int nr_secs, i; - -retry: - nr_secs = pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, rqd->nr_ppas, - from_cache); - - if (!*from_cache) - goto end; - - for (i = 0; i < nr_secs; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - sector_t lba = blba + i; - - if (pblk_ppa_empty(rqd->ppa_list[i])) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - } else if (pblk_addr_in_cache(rqd->ppa_list[i])) { - /* - * Try to read from write buffer. The address is later - * checked on the write buffer to prevent retrieving - * overwritten data. - */ - if (!pblk_read_from_cache(pblk, bio, lba, - rqd->ppa_list[i])) { - if (i == 0) { - /* - * We didn't call with bio_advance() - * yet, so we can just retry. - */ - goto retry; - } else { - /* - * We already call bio_advance() - * so we cannot retry and we need - * to quit that function in order - * to allow caller to handle the bio - * splitting in the current sector - * position. - */ - nr_secs = i; - goto end; - } - } - meta->lba = cpu_to_le64(lba); -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } - bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE); - } - -end: - if (pblk_io_aligned(pblk, nr_secs)) - rqd->is_seq = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(nr_secs, &pblk->inflight_reads); -#endif - - return nr_secs; -} - - -static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd, - sector_t blba) -{ - void *meta_list = rqd->meta_list; - int nr_lbas = rqd->nr_ppas; - int i; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - if (lba == ADDR_EMPTY) - continue; - - if (lba != blba + i) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[i], "seq", i); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - lba, (u64)blba + i); - WARN_ON(1); - } - } -} - -/* - * There can be holes in the lba list. - */ -static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd, - u64 *lba_list, int nr_lbas) -{ - void *meta_lba_list = rqd->meta_list; - int i, j; - - if (!pblk_is_oob_meta_supported(pblk)) - return; - - for (i = 0, j = 0; i < nr_lbas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, - meta_lba_list, j); - u64 lba = lba_list[i]; - u64 meta_lba; - - if (lba == ADDR_EMPTY) - continue; - - meta_lba = le64_to_cpu(meta->lba); - - if (lba != meta_lba) { -#ifdef CONFIG_NVM_PBLK_DEBUG - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - print_ppa(pblk, &ppa_list[j], "rnd", j); -#endif - pblk_err(pblk, "corrupted read LBA (%llu/%llu)\n", - meta_lba, lba); - WARN_ON(1); - } - - j++; - } - - WARN_ONCE(j != rqd->nr_ppas, "pblk: corrupted random request\n"); -} - -static void pblk_end_user_read(struct bio *bio, int error) -{ - if (error && error != NVM_RSP_WARN_HIGHECC) - bio_io_error(bio); - else - bio_endio(bio); -} - -static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, - bool put_line) -{ - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *int_bio = rqd->bio; - unsigned long start_time = r_ctx->start_time; - - bio_end_io_acct(int_bio, start_time); - - if (rqd->error) - pblk_log_read_err(pblk, rqd); - - pblk_read_check_seq(pblk, rqd, r_ctx->lba); - bio_put(int_bio); - - if (put_line) - pblk_rq_to_line_put(pblk, rqd); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); - atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); -#endif - - pblk_free_rqd(pblk, rqd, PBLK_READ); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_read(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = (struct bio *)r_ctx->private; - - pblk_end_user_read(bio, rqd->error); - __pblk_end_io_read(pblk, rqd, true); -} - -static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio, - sector_t lba, bool *from_cache) -{ - struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0); - struct ppa_addr ppa; - - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -retry: - if (pblk_ppa_empty(ppa)) { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - meta->lba = addr_empty; - return; - } - - /* Try to read from write buffer. The address is later checked on the - * write buffer to prevent retrieving overwritten data. - */ - if (pblk_addr_in_cache(ppa)) { - if (!pblk_read_from_cache(pblk, bio, lba, ppa)) { - pblk_lookup_l2p_seq(pblk, &ppa, lba, 1, from_cache); - goto retry; - } - - meta->lba = cpu_to_le64(lba); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->cache_reads); -#endif - } else { - rqd->ppa_addr = ppa; - } -} - -void pblk_submit_read(struct pblk *pblk, struct bio *bio) -{ - sector_t blba = pblk_get_lba(bio); - unsigned int nr_secs = pblk_get_secs(bio); - bool from_cache; - struct pblk_g_ctx *r_ctx; - struct nvm_rq *rqd; - struct bio *int_bio, *split_bio; - unsigned long start_time; - - start_time = bio_start_io_acct(bio); - - rqd = pblk_alloc_rqd(pblk, PBLK_READ); - - rqd->opcode = NVM_OP_PREAD; - rqd->nr_ppas = nr_secs; - rqd->private = pblk; - rqd->end_io = pblk_end_io_read; - - r_ctx = nvm_rq_to_pdu(rqd); - r_ctx->start_time = start_time; - r_ctx->lba = blba; - - if (pblk_alloc_rqd_meta(pblk, rqd)) { - bio_io_error(bio); - pblk_free_rqd(pblk, rqd, PBLK_READ); - return; - } - - /* Clone read bio to deal internally with: - * -read errors when reading from drive - * -bio_advance() calls during cache reads - */ - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - - if (nr_secs > 1) - nr_secs = pblk_read_ppalist_rq(pblk, rqd, int_bio, blba, - &from_cache); - else - pblk_read_rq(pblk, rqd, int_bio, blba, &from_cache); - -split_retry: - r_ctx->private = bio; /* original bio */ - rqd->bio = int_bio; /* internal bio */ - - if (from_cache && nr_secs == rqd->nr_ppas) { - /* All data was read from cache, we can complete the IO. */ - pblk_end_user_read(bio, 0); - atomic_inc(&pblk->inflight_io); - __pblk_end_io_read(pblk, rqd, false); - } else if (nr_secs != rqd->nr_ppas) { - /* The read bio request could be partially filled by the write - * buffer, but there are some holes that need to be read from - * the drive. In order to handle this, we will use block layer - * mechanism to split this request in to smaller ones and make - * a chain of it. - */ - split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL, - &pblk_bio_set); - bio_chain(split_bio, bio); - submit_bio_noacct(bio); - - /* New bio contains first N sectors of the previous one, so - * we can continue to use existing rqd, but we need to shrink - * the number of PPAs in it. New bio is also guaranteed that - * it contains only either data from cache or from drive, newer - * mix of them. - */ - bio = split_bio; - rqd->nr_ppas = nr_secs; - if (rqd->nr_ppas == 1) - rqd->ppa_addr = rqd->ppa_list[0]; - - /* Recreate int_bio - existing might have some needed internal - * fields modified already. - */ - bio_put(int_bio); - int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set); - goto split_retry; - } else if (pblk_submit_io(pblk, rqd, NULL)) { - /* Submitting IO to drive failed, let's report an error */ - rqd->error = -ENODEV; - pblk_end_io_read(rqd); - } -} - -static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, u64 *lba_list, - u64 *paddr_list_gc, unsigned int nr_secs) -{ - struct ppa_addr ppa_list_l2p[NVM_MAX_VLBA]; - struct ppa_addr ppa_gc; - int valid_secs = 0; - int i; - - pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); - - for (i = 0; i < nr_secs; i++) { - if (lba_list[i] == ADDR_EMPTY) - continue; - - ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); - if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { - paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; - continue; - } - - rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(valid_secs, &pblk->inflight_reads); -#endif - - return valid_secs; -} - -static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba, - u64 paddr_gc) -{ - struct ppa_addr ppa_l2p, ppa_gc; - int valid_secs = 0; - - if (lba == ADDR_EMPTY) - goto out; - - /* logic error: lba out-of-bounds */ - if (lba >= pblk->capacity) { - WARN(1, "pblk: read lba out of bounds\n"); - goto out; - } - - spin_lock(&pblk->trans_lock); - ppa_l2p = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); - if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) - goto out; - - rqd->ppa_addr = ppa_l2p; - valid_secs = 1; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_inc(&pblk->inflight_reads); -#endif - -out: - return valid_secs; -} - -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_rq rqd; - int ret = NVM_IO_OK; - - memset(&rqd, 0, sizeof(struct nvm_rq)); - - ret = pblk_alloc_rqd_meta(pblk, &rqd); - if (ret) - return ret; - - if (gc_rq->nr_secs > 1) { - gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list, - gc_rq->paddr_list, - gc_rq->nr_secs); - if (gc_rq->secs_to_gc == 1) - rqd.ppa_addr = rqd.ppa_list[0]; - } else { - gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, - gc_rq->lba_list[0], - gc_rq->paddr_list[0]); - } - - if (!(gc_rq->secs_to_gc)) - goto out; - - rqd.opcode = NVM_OP_PREAD; - rqd.nr_ppas = gc_rq->secs_to_gc; - - if (pblk_submit_io_sync(pblk, &rqd, gc_rq->data)) { - ret = -EIO; - goto err_free_dma; - } - - pblk_read_check_rand(pblk, &rqd, gc_rq->lba_list, gc_rq->nr_secs); - - atomic_dec(&pblk->inflight_io); - - if (rqd.error) { - atomic_long_inc(&pblk->read_failed_gc); -#ifdef CONFIG_NVM_PBLK_DEBUG - pblk_print_failed_rqd(pblk, &rqd, rqd.error); -#endif - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); - atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); -#endif - -out: - pblk_free_rqd_meta(pblk, &rqd); - return ret; - -err_free_dma: - pblk_free_rqd_meta(pblk, &rqd); - return ret; -} diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c deleted file mode 100644 index 0e6f0c76e930..000000000000 --- a/drivers/lightnvm/pblk-recovery.c +++ /dev/null @@ -1,874 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-recovery.c - pblk's recovery path - * - * The L2P recovery path is single threaded as the L2P table is updated in order - * following the line sequence ID. - */ - -#include "pblk.h" -#include "pblk-trace.h" - -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) -{ - u32 crc; - - crc = pblk_calc_emeta_crc(pblk, emeta_buf); - if (le32_to_cpu(emeta_buf->crc) != crc) - return 1; - - if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) - return 1; - - return 0; -} - -static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = line->emeta; - struct line_emeta *emeta_buf = emeta->buf; - __le64 *lba_list; - u64 data_start, data_end; - u64 nr_valid_lbas, nr_lbas = 0; - u64 i; - - lba_list = emeta_to_lbas(pblk, emeta_buf); - if (!lba_list) - return 1; - - data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = line->emeta_ssec; - nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - - for (i = data_start; i < data_end; i++) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, i, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - /* Do not update bad blocks */ - if (test_bit(pos, line->blk_bitmap)) - continue; - - if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) { - spin_lock(&line->lock); - if (test_and_set_bit(i, line->invalid_bitmap)) - WARN_ONCE(1, "pblk: rec. double invalidate:\n"); - else - le32_add_cpu(line->vsc, -1); - spin_unlock(&line->lock); - - continue; - } - - pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa); - nr_lbas++; - } - - if (nr_valid_lbas != nr_lbas) - pblk_err(pblk, "line %d - inconsistent lba list(%llu/%llu)\n", - line->id, nr_valid_lbas, nr_lbas); - - line->left_msecs = 0; - - return 0; -} - -static void pblk_update_line_wp(struct pblk *pblk, struct pblk_line *line, - u64 written_secs) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int i; - - for (i = 0; i < written_secs; i += pblk->min_write_pgs) - __pblk_alloc_page(pblk, line, pblk->min_write_pgs); - - spin_lock(&l_mg->free_lock); - if (written_secs > line->left_msecs) { - /* - * We have all data sectors written - * and some emeta sectors written too. - */ - line->left_msecs = 0; - } else { - /* We have only some data sectors written. */ - line->left_msecs -= written_secs; - } - spin_unlock(&l_mg->free_lock); -} - -static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - u64 written_secs = 0; - int valid_chunks = 0; - int i; - - for (i = 0; i < lm->blk_per_line; i++) { - struct nvm_chk_meta *chunk = &line->chks[i]; - - if (chunk->state & NVM_CHK_ST_OFFLINE) - continue; - - written_secs += chunk->wp; - valid_chunks++; - } - - if (lm->blk_per_line - nr_bb != valid_chunks) - pblk_err(pblk, "recovery line %d is bad\n", line->id); - - pblk_update_line_wp(pblk, line, written_secs - lm->smeta_sec); - - return written_secs; -} - -struct pblk_recov_alloc { - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list; - dma_addr_t dma_meta_list; -}; - -static void pblk_recov_complete(struct kref *ref) -{ - struct pblk_pad_rq *pad_rq = container_of(ref, struct pblk_pad_rq, ref); - - complete(&pad_rq->wait); -} - -static void pblk_end_io_recov(struct nvm_rq *rqd) -{ - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - struct pblk_pad_rq *pad_rq = rqd->private; - struct pblk *pblk = pad_rq->pblk; - - pblk_up_chunk(pblk, ppa_list[0]); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); - kref_put(&pad_rq->ref, pblk_recov_complete); -} - -/* pad line using line bitmap. */ -static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line, - int left_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - void *meta_list; - struct pblk_pad_rq *pad_rq; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *data; - __le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf); - u64 w_ptr = line->cur_sec; - int left_line_ppas, rq_ppas; - int i, j; - int ret = 0; - - spin_lock(&line->lock); - left_line_ppas = line->left_msecs; - spin_unlock(&line->lock); - - pad_rq = kmalloc(sizeof(struct pblk_pad_rq), GFP_KERNEL); - if (!pad_rq) - return -ENOMEM; - - data = vzalloc(array_size(pblk->max_write_pgs, geo->csecs)); - if (!data) { - ret = -ENOMEM; - goto free_rq; - } - - pad_rq->pblk = pblk; - init_completion(&pad_rq->wait); - kref_init(&pad_rq->ref); - -next_pad_rq: - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (rq_ppas < pblk->min_write_pgs) { - pblk_err(pblk, "corrupted pad line %d\n", line->id); - goto fail_complete; - } - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - ret = pblk_alloc_rqd_meta(pblk, rqd); - if (ret) { - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - rqd->bio = NULL; - rqd->opcode = NVM_OP_PWRITE; - rqd->is_seq = 1; - rqd->nr_ppas = rq_ppas; - rqd->end_io = pblk_end_io_recov; - rqd->private = pad_rq; - - ppa_list = nvm_rq_to_ppa_list(rqd); - meta_list = rqd->meta_list; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs); - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - w_ptr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) { - struct ppa_addr dev_ppa; - struct pblk_sec_meta *meta; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - - dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id); - - pblk_map_invalidate(pblk, dev_ppa); - lba_list[w_ptr] = addr_empty; - meta = pblk_get_meta(pblk, meta_list, i); - meta->lba = addr_empty; - ppa_list[i] = dev_ppa; - } - } - - kref_get(&pad_rq->ref); - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - pblk_up_chunk(pblk, ppa_list[0]); - kref_put(&pad_rq->ref, pblk_recov_complete); - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - goto fail_complete; - } - - left_line_ppas -= rq_ppas; - left_ppas -= rq_ppas; - if (left_ppas && left_line_ppas) - goto next_pad_rq; - -fail_complete: - kref_put(&pad_rq->ref, pblk_recov_complete); - wait_for_completion(&pad_rq->wait); - - if (!pblk_line_is_full(line)) - pblk_err(pblk, "corrupted padded line: %d\n", line->id); - - vfree(data); -free_rq: - kfree(pad_rq); - return ret; -} - -static int pblk_pad_distance(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int distance = geo->mw_cunits * geo->all_luns * geo->ws_opt; - - return (distance > line->left_msecs) ? line->left_msecs : distance; -} - -/* Return a chunk belonging to a line by stripe(write order) index */ -static struct nvm_chk_meta *pblk_get_stripe_chunk(struct pblk *pblk, - struct pblk_line *line, - int index) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - struct ppa_addr ppa; - int pos; - - rlun = &pblk->luns[index]; - ppa = rlun->bppa; - pos = pblk_ppa_to_pos(geo, ppa); - - return &line->chks[pos]; -} - -static int pblk_line_wps_are_unbalanced(struct pblk *pblk, - struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int blk_in_line = lm->blk_per_line; - struct nvm_chk_meta *chunk; - u64 max_wp, min_wp; - int i; - - i = find_first_zero_bit(line->blk_bitmap, blk_in_line); - - /* If there is one or zero good chunks in the line, - * the write pointers can't be unbalanced. - */ - if (i >= (blk_in_line - 1)) - return 0; - - chunk = pblk_get_stripe_chunk(pblk, line, i); - max_wp = chunk->wp; - if (max_wp > pblk->max_write_pgs) - min_wp = max_wp - pblk->max_write_pgs; - else - min_wp = 0; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - while (i < blk_in_line) { - chunk = pblk_get_stripe_chunk(pblk, line, i); - if (chunk->wp > max_wp || chunk->wp < min_wp) - return 1; - - i = find_next_zero_bit(line->blk_bitmap, blk_in_line, i + 1); - } - - return 0; -} - -static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, - struct pblk_recov_alloc p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr *ppa_list; - void *meta_list; - struct nvm_rq *rqd; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - __le64 *lba_list; - u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - bool padded = false; - int rq_ppas; - int i, j; - int ret; - u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec; - - if (pblk_line_wps_are_unbalanced(pblk, line)) - pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id); - - ppa_list = p.ppa_list; - meta_list = p.meta_list; - rqd = p.rqd; - data = p.data; - dma_ppa_list = p.dma_ppa_list; - dma_meta_list = p.dma_meta_list; - - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - -next_rq: - memset(rqd, 0, pblk_g_rq_size); - - rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false); - if (!rq_ppas) - rq_ppas = pblk->min_write_pgs; - -retry_rq: - rqd->bio = NULL; - rqd->opcode = NVM_OP_PREAD; - rqd->meta_list = meta_list; - rqd->nr_ppas = rq_ppas; - rqd->ppa_list = ppa_list; - rqd->dma_ppa_list = dma_ppa_list; - rqd->dma_meta_list = dma_meta_list; - ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_io_aligned(pblk, rq_ppas)) - rqd->is_seq = 1; - - for (i = 0; i < rqd->nr_ppas; ) { - struct ppa_addr ppa; - int pos; - - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - - while (test_bit(pos, line->blk_bitmap)) { - paddr += pblk->min_write_pgs; - ppa = addr_to_gen_ppa(pblk, paddr, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - } - - for (j = 0; j < pblk->min_write_pgs; j++, i++) - ppa_list[i] = - addr_to_gen_ppa(pblk, paddr + j, line->id); - } - - ret = pblk_submit_io_sync(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "I/O submission failed: %d\n", ret); - return ret; - } - - atomic_dec(&pblk->inflight_io); - - /* If a read fails, do a best effort by padding the line and retrying */ - if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) { - int pad_distance, ret; - - if (padded) { - pblk_log_read_err(pblk, rqd); - return -EINTR; - } - - pad_distance = pblk_pad_distance(pblk, line); - ret = pblk_recov_pad_line(pblk, line, pad_distance); - if (ret) { - return ret; - } - - padded = true; - goto retry_rq; - } - - pblk_get_packed_meta(pblk, rqd); - - for (i = 0; i < rqd->nr_ppas; i++) { - struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i); - u64 lba = le64_to_cpu(meta->lba); - - lba_list[paddr++] = cpu_to_le64(lba); - - if (lba == ADDR_EMPTY || lba >= pblk->capacity) - continue; - - line->nr_valid_lbas++; - pblk_update_map(pblk, lba, ppa_list[i]); - } - - left_ppas -= rq_ppas; - if (left_ppas > 0) - goto next_rq; - -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ON(padded && !pblk_line_is_full(line)); -#endif - - return 0; -} - -/* Scan line for lbas on out of bound area */ -static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_rq *rqd; - struct ppa_addr *ppa_list; - void *meta_list; - struct pblk_recov_alloc p; - void *data; - dma_addr_t dma_ppa_list, dma_meta_list; - int ret = 0; - - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) - return -ENOMEM; - - ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk); - dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk); - - data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL); - if (!data) { - ret = -ENOMEM; - goto free_meta_list; - } - - rqd = mempool_alloc(&pblk->r_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); - - p.ppa_list = ppa_list; - p.meta_list = meta_list; - p.rqd = rqd; - p.data = data; - p.dma_ppa_list = dma_ppa_list; - p.dma_meta_list = dma_meta_list; - - ret = pblk_recov_scan_oob(pblk, line, p); - if (ret) { - pblk_err(pblk, "could not recover L2P form OOB\n"); - goto out; - } - - if (pblk_line_is_full(line)) - pblk_line_recov_close(pblk, line); - -out: - mempool_free(rqd, &pblk->r_rq_pool); - kfree(data); -free_meta_list: - nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); - - return ret; -} - -/* Insert lines ordered by sequence number (seq_num) on list */ -static void pblk_recov_line_add_ordered(struct list_head *head, - struct pblk_line *line) -{ - struct pblk_line *t = NULL; - - list_for_each_entry(t, head, list) - if (t->seq_nr > line->seq_nr) - break; - - __list_add(&line->list, t->list.prev, &t->list); -} - -static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - unsigned int emeta_secs; - u64 emeta_start; - struct ppa_addr ppa; - int pos; - - emeta_secs = lm->emeta_sec[0]; - emeta_start = lm->sec_per_line; - - while (emeta_secs) { - emeta_start--; - ppa = addr_to_gen_ppa(pblk, emeta_start, line->id); - pos = pblk_ppa_to_pos(geo, ppa); - if (!test_bit(pos, line->blk_bitmap)) - emeta_secs--; - } - - return emeta_start; -} - -static int pblk_recov_check_line_version(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct line_header *header = &emeta->header; - - if (header->version_major != EMETA_VERSION_MAJOR) { - pblk_err(pblk, "line major version mismatch: %d, expected: %d\n", - header->version_major, EMETA_VERSION_MAJOR); - return 1; - } - -#ifdef CONFIG_NVM_PBLK_DEBUG - if (header->version_minor > EMETA_VERSION_MINOR) - pblk_info(pblk, "newer line minor version found: %d\n", - header->version_minor); -#endif - - return 0; -} - -static void pblk_recov_wa_counters(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct line_header *header = &emeta->header; - struct wa_counters *wa = emeta_to_wa(lm, emeta); - - /* WA counters were introduced in emeta version 0.2 */ - if (header->version_major > 0 || header->version_minor >= 2) { - u64 user = le64_to_cpu(wa->user); - u64 pad = le64_to_cpu(wa->pad); - u64 gc = le64_to_cpu(wa->gc); - - atomic64_set(&pblk->user_wa, user); - atomic64_set(&pblk->pad_wa, pad); - atomic64_set(&pblk->gc_wa, gc); - - pblk->user_rst_wa = user; - pblk->pad_rst_wa = pad; - pblk->gc_rst_wa = gc; - } -} - -static int pblk_line_was_written(struct pblk_line *line, - struct pblk *pblk) -{ - - struct pblk_line_meta *lm = &pblk->lm; - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct nvm_chk_meta *chunk; - struct ppa_addr bppa; - int smeta_blk; - - if (line->state == PBLK_LINESTATE_BAD) - return 0; - - smeta_blk = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line); - if (smeta_blk >= lm->blk_per_line) - return 0; - - bppa = pblk->luns[smeta_blk].bppa; - chunk = &line->chks[pblk_ppa_to_pos(geo, bppa)]; - - if (chunk->state & NVM_CHK_ST_CLOSED || - (chunk->state & NVM_CHK_ST_OPEN - && chunk->wp >= lm->smeta_sec)) - return 1; - - return 0; -} - -static bool pblk_line_is_open(struct pblk *pblk, struct pblk_line *line) -{ - struct pblk_line_meta *lm = &pblk->lm; - int i; - - for (i = 0; i < lm->blk_per_line; i++) - if (line->chks[i].state & NVM_CHK_ST_OPEN) - return true; - - return false; -} - -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line, *tline, *data_line = NULL; - struct pblk_smeta *smeta; - struct pblk_emeta *emeta; - struct line_smeta *smeta_buf; - int found_lines = 0, recovered_lines = 0, open_lines = 0; - int is_next = 0; - int meta_line; - int i, valid_uuid = 0; - LIST_HEAD(recov_list); - - /* TODO: Implement FTL snapshot */ - - /* Scan recovery - takes place when FTL snapshot fails */ - spin_lock(&l_mg->free_lock); - meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES); - set_bit(meta_line, &l_mg->meta_bitmap); - smeta = l_mg->sline_meta[meta_line]; - emeta = l_mg->eline_meta[meta_line]; - smeta_buf = (struct line_smeta *)smeta; - spin_unlock(&l_mg->free_lock); - - /* Order data lines using their sequence number */ - for (i = 0; i < l_mg->nr_lines; i++) { - u32 crc; - - line = &pblk->lines[i]; - - memset(smeta, 0, lm->smeta_len); - line->smeta = smeta; - line->lun_bitmap = ((void *)(smeta_buf)) + - sizeof(struct line_smeta); - - if (!pblk_line_was_written(line, pblk)) - continue; - - /* Lines that cannot be read are assumed as not written here */ - if (pblk_line_smeta_read(pblk, line)) - continue; - - crc = pblk_calc_smeta_crc(pblk, smeta_buf); - if (le32_to_cpu(smeta_buf->crc) != crc) - continue; - - if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) - continue; - - if (smeta_buf->header.version_major != SMETA_VERSION_MAJOR) { - pblk_err(pblk, "found incompatible line version %u\n", - smeta_buf->header.version_major); - return ERR_PTR(-EINVAL); - } - - /* The first valid instance uuid is used for initialization */ - if (!valid_uuid) { - import_guid(&pblk->instance_uuid, smeta_buf->header.uuid); - valid_uuid = 1; - } - - if (!guid_equal(&pblk->instance_uuid, - (guid_t *)&smeta_buf->header.uuid)) { - pblk_debug(pblk, "ignore line %u due to uuid mismatch\n", - i); - continue; - } - - /* Update line metadata */ - spin_lock(&line->lock); - line->id = le32_to_cpu(smeta_buf->header.id); - line->type = le16_to_cpu(smeta_buf->header.type); - line->seq_nr = le64_to_cpu(smeta_buf->seq_nr); - spin_unlock(&line->lock); - - /* Update general metadata */ - spin_lock(&l_mg->free_lock); - if (line->seq_nr >= l_mg->d_seq_nr) - l_mg->d_seq_nr = line->seq_nr + 1; - l_mg->nr_free_lines--; - spin_unlock(&l_mg->free_lock); - - if (pblk_line_recov_alloc(pblk, line)) - goto out; - - pblk_recov_line_add_ordered(&recov_list, line); - found_lines++; - pblk_debug(pblk, "recovering data line %d, seq:%llu\n", - line->id, smeta_buf->seq_nr); - } - - if (!found_lines) { - guid_gen(&pblk->instance_uuid); - - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - - goto out; - } - - /* Verify closed blocks and recover this portion of L2P table*/ - list_for_each_entry_safe(line, tline, &recov_list, list) { - recovered_lines++; - - line->emeta_ssec = pblk_line_emeta_start(pblk, line); - line->emeta = emeta; - memset(line->emeta->buf, 0, lm->emeta_len[0]); - - if (pblk_line_is_open(pblk, line)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_emeta(pblk, line->emeta->buf)) { - pblk_recov_l2p_from_oob(pblk, line); - goto next; - } - - if (pblk_recov_check_line_version(pblk, line->emeta->buf)) - return ERR_PTR(-EINVAL); - - pblk_recov_wa_counters(pblk, line->emeta->buf); - - if (pblk_recov_l2p_from_emeta(pblk, line)) - pblk_recov_l2p_from_oob(pblk, line); - -next: - if (pblk_line_is_full(line)) { - struct list_head *move_list; - - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_CLOSED; - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - move_list = pblk_line_gc_list(pblk, line); - spin_unlock(&line->lock); - - spin_lock(&l_mg->gc_lock); - list_move_tail(&line->list, move_list); - spin_unlock(&l_mg->gc_lock); - - mempool_free(line->map_bitmap, l_mg->bitmap_pool); - line->map_bitmap = NULL; - line->smeta = NULL; - line->emeta = NULL; - } else { - spin_lock(&line->lock); - line->state = PBLK_LINESTATE_OPEN; - spin_unlock(&line->lock); - - line->emeta->mem = 0; - atomic_set(&line->emeta->sync, 0); - - trace_pblk_line_state(pblk_disk_name(pblk), line->id, - line->state); - - data_line = line; - line->meta_line = meta_line; - - open_lines++; - } - } - - if (!open_lines) { - spin_lock(&l_mg->free_lock); - WARN_ON_ONCE(!test_and_clear_bit(meta_line, - &l_mg->meta_bitmap)); - spin_unlock(&l_mg->free_lock); - } else { - spin_lock(&l_mg->free_lock); - l_mg->data_line = data_line; - /* Allocate next line for preparation */ - l_mg->data_next = pblk_line_get(pblk); - if (l_mg->data_next) { - l_mg->data_next->seq_nr = l_mg->d_seq_nr++; - l_mg->data_next->type = PBLK_LINETYPE_DATA; - is_next = 1; - } - spin_unlock(&l_mg->free_lock); - } - - if (is_next) - pblk_line_erase(pblk, l_mg->data_next); - -out: - if (found_lines != recovered_lines) - pblk_err(pblk, "failed to recover all found lines %d/%d\n", - found_lines, recovered_lines); - - return data_line; -} - -/* - * Pad current line - */ -int pblk_recov_pad(struct pblk *pblk) -{ - struct pblk_line *line; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - int left_msecs; - int ret = 0; - - spin_lock(&l_mg->free_lock); - line = l_mg->data_line; - left_msecs = line->left_msecs; - spin_unlock(&l_mg->free_lock); - - ret = pblk_recov_pad_line(pblk, line, left_msecs); - if (ret) { - pblk_err(pblk, "tear down padding failed (%d)\n", ret); - return ret; - } - - pblk_line_close_meta(pblk, line); - return ret; -} diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c deleted file mode 100644 index a5f8bc2defbc..000000000000 --- a/drivers/lightnvm/pblk-rl.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-rl.c - pblk's rate limiter for user I/O - * - */ - -#include "pblk.h" - -static void pblk_rl_kick_u_timer(struct pblk_rl *rl) -{ - mod_timer(&rl->u_timer, jiffies + msecs_to_jiffies(5000)); -} - -int pblk_rl_is_limit(struct pblk_rl *rl) -{ - int rb_space; - - rb_space = atomic_read(&rl->rb_space); - - return (rb_space == 0); -} - -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_user_cnt = atomic_read(&rl->rb_user_cnt); - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0) && (rb_space - nr_entries < 0)) - return NVM_IO_ERR; - - if (rb_user_cnt >= rl->rb_user_max) - return NVM_IO_REQUEUE; - - return NVM_IO_OK; -} - -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries) -{ - int rb_space = atomic_read(&rl->rb_space); - - if (unlikely(rb_space >= 0)) - atomic_sub(nr_entries, &rl->rb_space); -} - -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries) -{ - int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt); - int rb_user_active; - - /* If there is no user I/O let GC take over space on the write buffer */ - rb_user_active = READ_ONCE(rl->rb_user_active); - return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active)); -} - -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_user_cnt); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 1); - pblk_rl_kick_u_timer(rl); -} - -void pblk_rl_werr_line_in(struct pblk_rl *rl) -{ - atomic_inc(&rl->werr_lines); -} - -void pblk_rl_werr_line_out(struct pblk_rl *rl) -{ - atomic_dec(&rl->werr_lines); -} - -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries) -{ - atomic_add(nr_entries, &rl->rb_gc_cnt); -} - -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc) -{ - atomic_sub(nr_user, &rl->rb_user_cnt); - atomic_sub(nr_gc, &rl->rb_gc_cnt); -} - -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_blocks); -} - -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl) -{ - return atomic_read(&rl->free_user_blocks); -} - -static void __pblk_rl_update_rates(struct pblk_rl *rl, - unsigned long free_blocks) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - int max = rl->rb_budget; - int werr_gc_needed = atomic_read(&rl->werr_lines); - - if (free_blocks >= rl->high) { - if (werr_gc_needed) { - /* Allocate a small budget for recovering - * lines with write errors - */ - rl->rb_gc_max = 1 << rl->rb_windows_pw; - rl->rb_user_max = max - rl->rb_gc_max; - rl->rb_state = PBLK_RL_WERR; - } else { - rl->rb_user_max = max; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_OFF; - } - } else if (free_blocks < rl->high) { - int shift = rl->high_pw - rl->rb_windows_pw; - int user_windows = free_blocks >> shift; - int user_max = user_windows << ilog2(NVM_MAX_VLBA); - - rl->rb_user_max = user_max; - rl->rb_gc_max = max - user_max; - - if (free_blocks <= rl->rsv_blocks) { - rl->rb_user_max = 0; - rl->rb_gc_max = max; - } - - /* In the worst case, we will need to GC lines in the low list - * (high valid sector count). If there are lines to GC on high - * or mid lists, these will be prioritized - */ - rl->rb_state = PBLK_RL_LOW; - } - - if (rl->rb_state != PBLK_RL_OFF) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); -} - -void pblk_rl_update_rates(struct pblk_rl *rl) -{ - __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl)); -} - -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_add(blk_in_line, &rl->free_blocks); - free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used) -{ - int blk_in_line = atomic_read(&line->blk_in_line); - int free_blocks; - - atomic_sub(blk_in_line, &rl->free_blocks); - - if (used) - free_blocks = atomic_sub_return(blk_in_line, - &rl->free_user_blocks); - else - free_blocks = atomic_read(&rl->free_user_blocks); - - __pblk_rl_update_rates(rl, free_blocks); -} - -int pblk_rl_high_thrs(struct pblk_rl *rl) -{ - return rl->high; -} - -int pblk_rl_max_io(struct pblk_rl *rl) -{ - return rl->rb_max_io; -} - -static void pblk_rl_u_timer(struct timer_list *t) -{ - struct pblk_rl *rl = from_timer(rl, t, u_timer); - - /* Release user I/O state. Protect from GC */ - smp_store_release(&rl->rb_user_active, 0); -} - -void pblk_rl_free(struct pblk_rl *rl) -{ - del_timer(&rl->u_timer); -} - -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold) -{ - struct pblk *pblk = container_of(rl, struct pblk, rl); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - int sec_meta, blk_meta; - unsigned int rb_windows; - - /* Consider sectors used for metadata */ - sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines; - blk_meta = DIV_ROUND_UP(sec_meta, geo->clba); - - rl->high = pblk->op_blks - blk_meta - lm->blk_per_line; - rl->high_pw = get_count_order(rl->high); - - rl->rsv_blocks = pblk_get_min_chks(pblk); - - /* This will always be a power-of-2 */ - rb_windows = budget / NVM_MAX_VLBA; - rl->rb_windows_pw = get_count_order(rb_windows); - - /* To start with, all buffer is available to user I/O writers */ - rl->rb_budget = budget; - rl->rb_user_max = budget; - rl->rb_gc_max = 0; - rl->rb_state = PBLK_RL_HIGH; - - /* Maximize I/O size and ansure that back threshold is respected */ - if (threshold) - rl->rb_max_io = budget - pblk->min_write_pgs_data - threshold; - else - rl->rb_max_io = budget - pblk->min_write_pgs_data - 1; - - atomic_set(&rl->rb_user_cnt, 0); - atomic_set(&rl->rb_gc_cnt, 0); - atomic_set(&rl->rb_space, -1); - atomic_set(&rl->werr_lines, 0); - - timer_setup(&rl->u_timer, pblk_rl_u_timer, 0); - - rl->rb_user_active = 0; - rl->rb_gc_active = 0; -} diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c deleted file mode 100644 index 6387302b03f2..000000000000 --- a/drivers/lightnvm/pblk-sysfs.c +++ /dev/null @@ -1,728 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a physical block-device target for Open-channel SSDs. - * - * pblk-sysfs.c - pblk's sysfs - * - */ - -#include "pblk.h" - -static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_lun *rlun; - ssize_t sz = 0; - int i; - - for (i = 0; i < geo->all_luns; i++) { - int active = 1; - - rlun = &pblk->luns[i]; - if (!down_trylock(&rlun->wr_sem)) { - active = 0; - up(&rlun->wr_sem); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "pblk: pos:%d, ch:%d, lun:%d - %d\n", - i, - rlun->bppa.a.ch, - rlun->bppa.a.lun, - active); - } - - return sz; -} - -static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page) -{ - int free_blocks, free_user_blocks, total_blocks; - int rb_user_max, rb_user_cnt; - int rb_gc_max, rb_gc_cnt, rb_budget, rb_state; - - free_blocks = pblk_rl_nr_free_blks(&pblk->rl); - free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl); - rb_user_max = pblk->rl.rb_user_max; - rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt); - rb_gc_max = pblk->rl.rb_gc_max; - rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt); - rb_budget = pblk->rl.rb_budget; - rb_state = pblk->rl.rb_state; - - total_blocks = pblk->rl.total_blocks; - - return snprintf(page, PAGE_SIZE, - "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n", - rb_user_cnt, - rb_user_max, - rb_gc_cnt, - rb_gc_max, - rb_state, - rb_budget, - pblk->rl.high, - free_blocks, - free_user_blocks, - total_blocks, - READ_ONCE(pblk->rl.rb_user_active)); -} - -static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page) -{ - int gc_enabled, gc_active; - - pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active); - return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n", - gc_enabled, gc_active); -} - -static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page) -{ - ssize_t sz; - - sz = snprintf(page, PAGE_SIZE, - "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n", - atomic_long_read(&pblk->read_failed), - atomic_long_read(&pblk->read_high_ecc), - atomic_long_read(&pblk->read_empty), - atomic_long_read(&pblk->read_failed_gc), - atomic_long_read(&pblk->write_failed), - atomic_long_read(&pblk->erase_failed)); - - return sz; -} - -static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page) -{ - return pblk_rb_sysfs(&pblk->rwb, page); -} - -static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - ssize_t sz = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - struct nvm_addrf_12 *gppaf = (struct nvm_addrf_12 *)&geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - pblk->addrf_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n", - gppaf->blk_offset, gppaf->blk_len, - gppaf->pg_offset, gppaf->pg_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->ch_offset, gppaf->ch_len, - gppaf->pln_offset, gppaf->pln_len, - gppaf->sec_offset, gppaf->sec_len); - } else { - struct nvm_addrf *ppaf = &pblk->addrf; - struct nvm_addrf *gppaf = &geo->addrf; - - sz = scnprintf(page, PAGE_SIZE, - "pblk:(s:%d)ch:%d/%d,lun:%d/%d,chk:%d/%d/sec:%d/%d\n", - pblk->addrf_len, - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->chk_offset, ppaf->chk_len, - ppaf->sec_offset, ppaf->sec_len); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "device:ch:%d/%d,lun:%d/%d,chk:%d/%d,sec:%d/%d\n", - gppaf->ch_offset, gppaf->ch_len, - gppaf->lun_offset, gppaf->lun_len, - gppaf->chk_offset, gppaf->chk_len, - gppaf->sec_offset, gppaf->sec_len); - } - - return sz; -} - -static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *line; - ssize_t sz = 0; - int nr_free_lines; - int cur_data, cur_log; - int free_line_cnt = 0, closed_line_cnt = 0, emeta_line_cnt = 0; - int d_line_cnt = 0, l_line_cnt = 0; - int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0; - int gc_werr = 0; - - int bad = 0, cor = 0; - int msecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0; - int map_weight = 0, meta_weight = 0; - - spin_lock(&l_mg->free_lock); - cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1; - cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1; - nr_free_lines = l_mg->nr_free_lines; - - list_for_each_entry(line, &l_mg->free_list, list) - free_line_cnt++; - spin_unlock(&l_mg->free_lock); - - spin_lock(&l_mg->close_lock); - list_for_each_entry(line, &l_mg->emeta_list, list) - emeta_line_cnt++; - spin_unlock(&l_mg->close_lock); - - spin_lock(&l_mg->gc_lock); - list_for_each_entry(line, &l_mg->gc_full_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_full++; - } - - list_for_each_entry(line, &l_mg->gc_high_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_high++; - } - - list_for_each_entry(line, &l_mg->gc_mid_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_mid++; - } - - list_for_each_entry(line, &l_mg->gc_low_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_low++; - } - - list_for_each_entry(line, &l_mg->gc_empty_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_empty++; - } - - list_for_each_entry(line, &l_mg->gc_werr_list, list) { - if (line->type == PBLK_LINETYPE_DATA) - d_line_cnt++; - else if (line->type == PBLK_LINETYPE_LOG) - l_line_cnt++; - closed_line_cnt++; - gc_werr++; - } - - list_for_each_entry(line, &l_mg->bad_list, list) - bad++; - list_for_each_entry(line, &l_mg->corrupt_list, list) - cor++; - spin_unlock(&l_mg->gc_lock); - - spin_lock(&l_mg->free_lock); - if (l_mg->data_line) { - cur_sec = l_mg->data_line->cur_sec; - msecs = l_mg->data_line->left_msecs; - vsc = le32_to_cpu(*l_mg->data_line->vsc); - sec_in_line = l_mg->data_line->sec_in_line; - meta_weight = bitmap_weight(&l_mg->meta_bitmap, - PBLK_DATA_LINES); - - spin_lock(&l_mg->data_line->lock); - if (l_mg->data_line->map_bitmap) - map_weight = bitmap_weight(l_mg->data_line->map_bitmap, - lm->sec_per_line); - else - map_weight = 0; - spin_unlock(&l_mg->data_line->lock); - } - spin_unlock(&l_mg->free_lock); - - if (nr_free_lines != free_line_cnt) - pblk_err(pblk, "corrupted free line list:%d/%d\n", - nr_free_lines, free_line_cnt); - - sz = scnprintf(page, PAGE_SIZE - sz, - "line: nluns:%d, nblks:%d, nsecs:%d\n", - geo->all_luns, lm->blk_per_line, lm->sec_per_line); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n", - cur_data, cur_log, - nr_free_lines, - emeta_line_cnt, meta_weight, - closed_line_cnt, - bad, cor, - d_line_cnt, l_line_cnt, - l_mg->nr_lines); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, werr: %d, queue:%d\n", - gc_full, gc_high, gc_mid, gc_low, gc_empty, gc_werr, - atomic_read(&pblk->gc.read_inflight_gc)); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", - cur_data, cur_sec, msecs, vsc, sec_in_line, - map_weight, lm->sec_per_line, - atomic_read(&pblk->inflight_io)); - - return sz; -} - -static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_meta *lm = &pblk->lm; - ssize_t sz = 0; - - sz = scnprintf(page, PAGE_SIZE - sz, - "smeta - len:%d, secs:%d\n", - lm->smeta_len, lm->smeta_sec); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "emeta - len:%d, sec:%d, bb_start:%d\n", - lm->emeta_len[0], lm->emeta_sec[0], - lm->emeta_bb); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "bitmap lengths: sec:%d, blk:%d, lun:%d\n", - lm->sec_bitmap_len, - lm->blk_bitmap_len, - lm->lun_bitmap_len); - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "blk_line:%d, sec_line:%d, sec_blk:%d\n", - lm->blk_per_line, - lm->sec_per_line, - geo->clba); - - return sz; -} - -static ssize_t pblk_sysfs_get_sec_per_write(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, "%d\n", pblk->sec_per_write); -} - -static ssize_t pblk_get_write_amp(u64 user, u64 gc, u64 pad, - char *page) -{ - int sz; - - sz = scnprintf(page, PAGE_SIZE, - "user:%lld gc:%lld pad:%lld WA:", - user, gc, pad); - - if (!user) { - sz += scnprintf(page + sz, PAGE_SIZE - sz, "NaN\n"); - } else { - u64 wa_int; - u32 wa_frac; - - wa_int = (user + gc + pad) * 100000; - wa_int = div64_u64(wa_int, user); - wa_int = div_u64_rem(wa_int, 100000, &wa_frac); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%llu.%05u\n", - wa_int, wa_frac); - } - - return sz; -} - -static ssize_t pblk_sysfs_get_write_amp_mileage(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp(atomic64_read(&pblk->user_wa), - atomic64_read(&pblk->gc_wa), atomic64_read(&pblk->pad_wa), - page); -} - -static ssize_t pblk_sysfs_get_write_amp_trip(struct pblk *pblk, char *page) -{ - return pblk_get_write_amp( - atomic64_read(&pblk->user_wa) - pblk->user_rst_wa, - atomic64_read(&pblk->gc_wa) - pblk->gc_rst_wa, - atomic64_read(&pblk->pad_wa) - pblk->pad_rst_wa, page); -} - -static long long bucket_percentage(unsigned long long bucket, - unsigned long long total) -{ - int p = bucket * 100; - - p = div_u64(p, total); - - return p; -} - -static ssize_t pblk_sysfs_get_padding_dist(struct pblk *pblk, char *page) -{ - int sz = 0; - unsigned long long total; - unsigned long long total_buckets = 0; - int buckets = pblk->min_write_pgs - 1; - int i; - - total = atomic64_read(&pblk->nr_flush) - pblk->nr_flush_rst; - if (!total) { - for (i = 0; i < (buckets + 1); i++) - sz += scnprintf(page + sz, PAGE_SIZE - sz, - "%d:0 ", i); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; - } - - for (i = 0; i < buckets; i++) - total_buckets += atomic64_read(&pblk->pad_dist[i]); - - sz += scnprintf(page + sz, PAGE_SIZE - sz, "0:%lld%% ", - bucket_percentage(total - total_buckets, total)); - - for (i = 0; i < buckets; i++) { - unsigned long long p; - - p = bucket_percentage(atomic64_read(&pblk->pad_dist[i]), - total); - sz += scnprintf(page + sz, PAGE_SIZE - sz, "%d:%lld%% ", - i + 1, p); - } - sz += scnprintf(page + sz, PAGE_SIZE - sz, "\n"); - - return sz; -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page) -{ - return snprintf(page, PAGE_SIZE, - "%lu\t%lu\t%ld\t%llu\t%ld\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n", - atomic_long_read(&pblk->inflight_writes), - atomic_long_read(&pblk->inflight_reads), - atomic_long_read(&pblk->req_writes), - (u64)atomic64_read(&pblk->nr_flush), - atomic_long_read(&pblk->padded_writes), - atomic_long_read(&pblk->padded_wb), - atomic_long_read(&pblk->sub_writes), - atomic_long_read(&pblk->sync_writes), - atomic_long_read(&pblk->recov_writes), - atomic_long_read(&pblk->recov_gc_writes), - atomic_long_read(&pblk->recov_gc_reads), - atomic_long_read(&pblk->cache_reads), - atomic_long_read(&pblk->sync_reads)); -} -#endif - -static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page, - size_t len) -{ - size_t c_len; - int force; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &force)) - return -EINVAL; - - pblk_gc_sysfs_force(pblk, force); - - return len; -} - -static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int sec_per_write; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &sec_per_write)) - return -EINVAL; - - if (!pblk_is_oob_meta_supported(pblk)) { - /* For packed metadata case it is - * not allowed to change sec_per_write. - */ - return -EINVAL; - } - - if (sec_per_write < pblk->min_write_pgs - || sec_per_write > pblk->max_write_pgs - || sec_per_write % pblk->min_write_pgs != 0) - return -EINVAL; - - pblk_set_sec_per_write(pblk, sec_per_write); - - return len; -} - -static ssize_t pblk_sysfs_set_write_amp_trip(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - pblk->user_rst_wa = atomic64_read(&pblk->user_wa); - pblk->pad_rst_wa = atomic64_read(&pblk->pad_wa); - pblk->gc_rst_wa = atomic64_read(&pblk->gc_wa); - - return len; -} - - -static ssize_t pblk_sysfs_set_padding_dist(struct pblk *pblk, - const char *page, size_t len) -{ - size_t c_len; - int reset_value; - int buckets = pblk->min_write_pgs - 1; - int i; - - c_len = strcspn(page, "\n"); - if (c_len >= len) - return -EINVAL; - - if (kstrtouint(page, 0, &reset_value)) - return -EINVAL; - - if (reset_value != 0) - return -EINVAL; - - for (i = 0; i < buckets; i++) - atomic64_set(&pblk->pad_dist[i], 0); - - pblk->nr_flush_rst = atomic64_read(&pblk->nr_flush); - - return len; -} - -static struct attribute sys_write_luns = { - .name = "write_luns", - .mode = 0444, -}; - -static struct attribute sys_rate_limiter_attr = { - .name = "rate_limiter", - .mode = 0444, -}; - -static struct attribute sys_gc_state = { - .name = "gc_state", - .mode = 0444, -}; - -static struct attribute sys_errors_attr = { - .name = "errors", - .mode = 0444, -}; - -static struct attribute sys_rb_attr = { - .name = "write_buffer", - .mode = 0444, -}; - -static struct attribute sys_stats_ppaf_attr = { - .name = "ppa_format", - .mode = 0444, -}; - -static struct attribute sys_lines_attr = { - .name = "lines", - .mode = 0444, -}; - -static struct attribute sys_lines_info_attr = { - .name = "lines_info", - .mode = 0444, -}; - -static struct attribute sys_gc_force = { - .name = "gc_force", - .mode = 0200, -}; - -static struct attribute sys_max_sec_per_write = { - .name = "max_sec_per_write", - .mode = 0644, -}; - -static struct attribute sys_write_amp_mileage = { - .name = "write_amp_mileage", - .mode = 0444, -}; - -static struct attribute sys_write_amp_trip = { - .name = "write_amp_trip", - .mode = 0644, -}; - -static struct attribute sys_padding_dist = { - .name = "padding_dist", - .mode = 0644, -}; - -#ifdef CONFIG_NVM_PBLK_DEBUG -static struct attribute sys_stats_debug_attr = { - .name = "stats", - .mode = 0444, -}; -#endif - -static struct attribute *pblk_attrs[] = { - &sys_write_luns, - &sys_rate_limiter_attr, - &sys_errors_attr, - &sys_gc_state, - &sys_gc_force, - &sys_max_sec_per_write, - &sys_rb_attr, - &sys_stats_ppaf_attr, - &sys_lines_attr, - &sys_lines_info_attr, - &sys_write_amp_mileage, - &sys_write_amp_trip, - &sys_padding_dist, -#ifdef CONFIG_NVM_PBLK_DEBUG - &sys_stats_debug_attr, -#endif - NULL, -}; - -static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "rate_limiter") == 0) - return pblk_sysfs_rate_limiter(pblk, buf); - else if (strcmp(attr->name, "write_luns") == 0) - return pblk_sysfs_luns_show(pblk, buf); - else if (strcmp(attr->name, "gc_state") == 0) - return pblk_sysfs_gc_state_show(pblk, buf); - else if (strcmp(attr->name, "errors") == 0) - return pblk_sysfs_stats(pblk, buf); - else if (strcmp(attr->name, "write_buffer") == 0) - return pblk_sysfs_write_buffer(pblk, buf); - else if (strcmp(attr->name, "ppa_format") == 0) - return pblk_sysfs_ppaf(pblk, buf); - else if (strcmp(attr->name, "lines") == 0) - return pblk_sysfs_lines(pblk, buf); - else if (strcmp(attr->name, "lines_info") == 0) - return pblk_sysfs_lines_info(pblk, buf); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_get_sec_per_write(pblk, buf); - else if (strcmp(attr->name, "write_amp_mileage") == 0) - return pblk_sysfs_get_write_amp_mileage(pblk, buf); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_get_write_amp_trip(pblk, buf); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_get_padding_dist(pblk, buf); -#ifdef CONFIG_NVM_PBLK_DEBUG - else if (strcmp(attr->name, "stats") == 0) - return pblk_sysfs_stats_debug(pblk, buf); -#endif - return 0; -} - -static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct pblk *pblk = container_of(kobj, struct pblk, kobj); - - if (strcmp(attr->name, "gc_force") == 0) - return pblk_sysfs_gc_force(pblk, buf, len); - else if (strcmp(attr->name, "max_sec_per_write") == 0) - return pblk_sysfs_set_sec_per_write(pblk, buf, len); - else if (strcmp(attr->name, "write_amp_trip") == 0) - return pblk_sysfs_set_write_amp_trip(pblk, buf, len); - else if (strcmp(attr->name, "padding_dist") == 0) - return pblk_sysfs_set_padding_dist(pblk, buf, len); - return 0; -} - -static const struct sysfs_ops pblk_sysfs_ops = { - .show = pblk_sysfs_show, - .store = pblk_sysfs_store, -}; - -static struct kobj_type pblk_ktype = { - .sysfs_ops = &pblk_sysfs_ops, - .default_attrs = pblk_attrs, -}; - -int pblk_sysfs_init(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - struct device *parent_dev = disk_to_dev(pblk->disk); - int ret; - - ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype, - kobject_get(&parent_dev->kobj), - "%s", "pblk"); - if (ret) { - pblk_err(pblk, "could not register\n"); - return ret; - } - - kobject_uevent(&pblk->kobj, KOBJ_ADD); - return 0; -} - -void pblk_sysfs_exit(struct gendisk *tdisk) -{ - struct pblk *pblk = tdisk->private_data; - - kobject_uevent(&pblk->kobj, KOBJ_REMOVE); - kobject_del(&pblk->kobj); - kobject_put(&pblk->kobj); -} diff --git a/drivers/lightnvm/pblk-trace.h b/drivers/lightnvm/pblk-trace.h deleted file mode 100644 index 47b67c6bff7a..000000000000 --- a/drivers/lightnvm/pblk-trace.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM pblk - -#if !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PBLK_H - -#include - -struct ppa_addr; - -#define show_chunk_flags(state) __print_flags(state, "", \ - { NVM_CHK_ST_FREE, "FREE", }, \ - { NVM_CHK_ST_CLOSED, "CLOSED", }, \ - { NVM_CHK_ST_OPEN, "OPEN", }, \ - { NVM_CHK_ST_OFFLINE, "OFFLINE", }) - -#define show_line_state(state) __print_symbolic(state, \ - { PBLK_LINESTATE_NEW, "NEW", }, \ - { PBLK_LINESTATE_FREE, "FREE", }, \ - { PBLK_LINESTATE_OPEN, "OPEN", }, \ - { PBLK_LINESTATE_CLOSED, "CLOSED", }, \ - { PBLK_LINESTATE_GC, "GC", }, \ - { PBLK_LINESTATE_BAD, "BAD", }, \ - { PBLK_LINESTATE_CORRUPT, "CORRUPT" }) - - -#define show_pblk_state(state) __print_symbolic(state, \ - { PBLK_STATE_RUNNING, "RUNNING", }, \ - { PBLK_STATE_STOPPING, "STOPPING", }, \ - { PBLK_STATE_RECOVERING, "RECOVERING", }, \ - { PBLK_STATE_STOPPED, "STOPPED" }) - -#define show_chunk_erase_state(state) __print_symbolic(state, \ - { PBLK_CHUNK_RESET_START, "START", }, \ - { PBLK_CHUNK_RESET_DONE, "OK", }, \ - { PBLK_CHUNK_RESET_FAILED, "FAILED" }) - - -TRACE_EVENT(pblk_chunk_reset, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_erase_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_chunk_state, - - TP_PROTO(const char *name, struct ppa_addr *ppa, int state), - - TP_ARGS(name, ppa, state), - - TP_STRUCT__entry( - __string(name, name) - __field(u64, ppa) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->ppa = ppa->ppa; - __entry->state = state; - ), - - TP_printk("dev=%s grp=%llu pu=%llu chk=%llu state=%s", __get_str(name), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.grp), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.pu), - (u64)(((struct ppa_addr *)(&__entry->ppa))->m.chk), - show_chunk_flags((int)__entry->state)) - -); - -TRACE_EVENT(pblk_line_state, - - TP_PROTO(const char *name, int line, int state), - - TP_ARGS(name, line, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, line) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->line = line; - __entry->state = state; - ), - - TP_printk("dev=%s line=%d state=%s", __get_str(name), - (int)__entry->line, - show_line_state((int)__entry->state)) - -); - -TRACE_EVENT(pblk_state, - - TP_PROTO(const char *name, int state), - - TP_ARGS(name, state), - - TP_STRUCT__entry( - __string(name, name) - __field(int, state) - ), - - TP_fast_assign( - __assign_str(name, name); - __entry->state = state; - ), - - TP_printk("dev=%s state=%s", __get_str(name), - show_pblk_state((int)__entry->state)) - -); - -#endif /* !defined(_TRACE_PBLK_H) || defined(TRACE_HEADER_MULTI_READ) */ - -/* This part must be outside protection */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../drivers/lightnvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE pblk-trace -#include diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c deleted file mode 100644 index b9a2aeba95ab..000000000000 --- a/drivers/lightnvm/pblk-write.c +++ /dev/null @@ -1,665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2016 CNEX Labs - * Initial release: Javier Gonzalez - * Matias Bjorling - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * pblk-write.c - pblk's write path from write buffer to media - */ - -#include "pblk.h" -#include "pblk-trace.h" - -static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct bio *original_bio; - struct pblk_rb *rwb = &pblk->rwb; - unsigned long ret; - int i; - - for (i = 0; i < c_ctx->nr_valid; i++) { - struct pblk_w_ctx *w_ctx; - int pos = c_ctx->sentry + i; - int flags; - - w_ctx = pblk_rb_w_ctx(rwb, pos); - flags = READ_ONCE(w_ctx->flags); - - if (flags & PBLK_FLUSH_ENTRY) { - flags &= ~PBLK_FLUSH_ENTRY; - /* Release flags on context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_dec(&rwb->inflight_flush_point); -#endif - } - - while ((original_bio = bio_list_pop(&w_ctx->bios))) - bio_endio(original_bio); - } - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); -#endif - - ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return ret; -} - -static unsigned long pblk_end_queued_w_bio(struct pblk *pblk, - struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - list_del(&c_ctx->list); - return pblk_end_w_bio(pblk, rqd, c_ctx); -} - -static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *c, *r; - unsigned long flags; - unsigned long pos; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes); -#endif - pblk_up_rq(pblk, c_ctx->lun_bitmap); - - pos = pblk_rb_sync_init(&pblk->rwb, &flags); - if (pos == c_ctx->sentry) { - pos = pblk_end_w_bio(pblk, rqd, c_ctx); - -retry: - list_for_each_entry_safe(c, r, &pblk->compl_list, list) { - rqd = nvm_rq_from_c_ctx(c); - if (c->sentry == pos) { - pos = pblk_end_queued_w_bio(pblk, rqd, c); - goto retry; - } - } - } else { - WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd); - list_add_tail(&c_ctx->list, &pblk->compl_list); - } - pblk_rb_sync_end(&pblk->rwb, &flags); -} - -/* Map remaining sectors in chunk, starting from ppa */ -static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa, - int rqd_ppas) -{ - struct pblk_line *line; - struct ppa_addr map_ppa = *ppa; - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - __le64 *lba_list; - u64 paddr; - int done = 0; - int n = 0; - - line = pblk_ppa_to_line(pblk, *ppa); - lba_list = emeta_to_lbas(pblk, line->emeta->buf); - - spin_lock(&line->lock); - - while (!done) { - paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); - - if (!test_and_set_bit(paddr, line->map_bitmap)) - line->left_msecs--; - - if (n < rqd_ppas && lba_list[paddr] != addr_empty) - line->nr_valid_lbas--; - - lba_list[paddr] = addr_empty; - - if (!test_and_set_bit(paddr, line->invalid_bitmap)) - le32_add_cpu(line->vsc, -1); - - done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa); - - n++; - } - - line->w_err_gc->has_write_err = 1; - spin_unlock(&line->lock); -} - -static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, - unsigned int nr_entries) -{ - struct pblk_rb *rb = &pblk->rwb; - struct pblk_rb_entry *entry; - struct pblk_line *line; - struct pblk_w_ctx *w_ctx; - struct ppa_addr ppa_l2p; - int flags; - unsigned int i; - - spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_entries; i++) { - entry = &rb->entries[pblk_rb_ptr_wrap(rb, sentry, i)]; - w_ctx = &entry->w_ctx; - - /* Check if the lba has been overwritten */ - if (w_ctx->lba != ADDR_EMPTY) { - ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); - if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) - w_ctx->lba = ADDR_EMPTY; - } - - /* Mark up the entry as submittable again */ - flags = READ_ONCE(w_ctx->flags); - flags |= PBLK_WRITTEN_DATA; - /* Release flags on write context. Protect from writes */ - smp_store_release(&w_ctx->flags, flags); - - /* Decrease the reference count to the line as we will - * re-map these entries - */ - line = pblk_ppa_to_line(pblk, w_ctx->ppa); - atomic_dec(&line->sec_to_update); - kref_put(&line->ref, pblk_line_put); - } - spin_unlock(&pblk->trans_lock); -} - -static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) -{ - struct pblk_c_ctx *r_ctx; - - r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); - if (!r_ctx) - return; - - r_ctx->lun_bitmap = NULL; - r_ctx->sentry = c_ctx->sentry; - r_ctx->nr_valid = c_ctx->nr_valid; - r_ctx->nr_padded = c_ctx->nr_padded; - - spin_lock(&pblk->resubmit_lock); - list_add_tail(&r_ctx->list, &pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes); -#endif -} - -static void pblk_submit_rec(struct work_struct *work) -{ - struct pblk_rec_ctx *recovery = - container_of(work, struct pblk_rec_ctx, ws_rec); - struct pblk *pblk = recovery->pblk; - struct nvm_rq *rqd = recovery->rqd; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - pblk_log_write_err(pblk, rqd); - - pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas); - pblk_queue_resubmit(pblk, c_ctx); - - pblk_up_rq(pblk, c_ctx->lun_bitmap); - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, - c_ctx->nr_padded); - bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - mempool_free(recovery, &pblk->rec_pool); - - atomic_dec(&pblk->inflight_io); - pblk_write_kick(pblk); -} - - -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_rec_ctx *recovery; - - recovery = mempool_alloc(&pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pblk_err(pblk, "could not allocate recovery work\n"); - return; - } - - recovery->pblk = pblk; - recovery->rqd = rqd; - - INIT_WORK(&recovery->ws_rec, pblk_submit_rec); - queue_work(pblk->close_wq, &recovery->ws_rec); -} - -static void pblk_end_io_write(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - - if (rqd->error) { - pblk_end_w_fail(pblk, rqd); - return; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); -#ifdef CONFIG_NVM_PBLK_DEBUG - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif - } - - pblk_complete_write(pblk, rqd, c_ctx); - atomic_dec(&pblk->inflight_io); -} - -static void pblk_end_io_write_meta(struct nvm_rq *rqd) -{ - struct pblk *pblk = rqd->private; - struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); - struct pblk_line *line = m_ctx->private; - struct pblk_emeta *emeta = line->emeta; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - int sync; - - pblk_up_chunk(pblk, ppa_list[0]); - - if (rqd->error) { - pblk_log_write_err(pblk, rqd); - pblk_err(pblk, "metadata I/O failed. Line %d\n", line->id); - line->w_err_gc->has_write_err = 1; - } else { - if (trace_pblk_chunk_state_enabled()) - pblk_check_chunk_state_update(pblk, rqd); - } - - sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); - if (sync == emeta->nr_entries) - pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, - GFP_ATOMIC, pblk->close_wq); - - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - - atomic_dec(&pblk->inflight_io); -} - -static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int nr_secs, nvm_end_io_fn(*end_io)) -{ - /* Setup write request */ - rqd->opcode = NVM_OP_PWRITE; - rqd->nr_ppas = nr_secs; - rqd->is_seq = 1; - rqd->private = pblk; - rqd->end_io = end_io; - - return pblk_alloc_rqd_meta(pblk, rqd); -} - -static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct ppa_addr *erase_ppa) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - unsigned int valid = c_ctx->nr_valid; - unsigned int padded = c_ctx->nr_padded; - unsigned int nr_secs = valid + padded; - unsigned long *lun_bitmap; - int ret; - - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); - if (!lun_bitmap) - return -ENOMEM; - c_ctx->lun_bitmap = lun_bitmap; - - ret = pblk_alloc_w_rq(pblk, rqd, nr_secs, pblk_end_io_write); - if (ret) { - kfree(lun_bitmap); - return ret; - } - - if (likely(!e_line || !atomic_read(&e_line->left_eblks))) - ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, 0); - else - ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, - valid, erase_ppa); - - return ret; -} - -static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, - unsigned int secs_to_flush) -{ - int secs_to_sync; - - secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true); - -#ifdef CONFIG_NVM_PBLK_DEBUG - if ((!secs_to_sync && secs_to_flush) - || (secs_to_sync < 0) - || (secs_to_sync > secs_avail && !secs_to_flush)) { - pblk_err(pblk, "bad sector calculation (a:%d,s:%d,f:%d)\n", - secs_avail, secs_to_sync, secs_to_flush); - } -#endif - - return secs_to_sync; -} - -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_emeta *emeta = meta_line->emeta; - struct ppa_addr *ppa_list; - struct pblk_g_ctx *m_ctx; - struct nvm_rq *rqd; - void *data; - u64 paddr; - int rq_ppas = pblk->min_write_pgs; - int id = meta_line->id; - int rq_len; - int i, j; - int ret; - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); - - m_ctx = nvm_rq_to_pdu(rqd); - m_ctx->private = meta_line; - - rq_len = rq_ppas * geo->csecs; - data = ((void *)emeta->buf) + emeta->mem; - - ret = pblk_alloc_w_rq(pblk, rqd, rq_ppas, pblk_end_io_write_meta); - if (ret) - goto fail_free_rqd; - - ppa_list = nvm_rq_to_ppa_list(rqd); - for (i = 0; i < rqd->nr_ppas; ) { - spin_lock(&meta_line->lock); - paddr = __pblk_alloc_page(pblk, meta_line, rq_ppas); - spin_unlock(&meta_line->lock); - for (j = 0; j < rq_ppas; j++, i++, paddr++) - ppa_list[i] = addr_to_gen_ppa(pblk, paddr, id); - } - - spin_lock(&l_mg->close_lock); - emeta->mem += rq_len; - if (emeta->mem >= lm->emeta_len[0]) - list_del(&meta_line->list); - spin_unlock(&l_mg->close_lock); - - pblk_down_chunk(pblk, ppa_list[0]); - - ret = pblk_submit_io(pblk, rqd, data); - if (ret) { - pblk_err(pblk, "emeta I/O submission failed: %d\n", ret); - goto fail_rollback; - } - - return NVM_IO_OK; - -fail_rollback: - pblk_up_chunk(pblk, ppa_list[0]); - spin_lock(&l_mg->close_lock); - pblk_dealloc_page(pblk, meta_line, rq_ppas); - list_add(&meta_line->list, &meta_line->list); - spin_unlock(&l_mg->close_lock); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); - return ret; -} - -static inline bool pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct nvm_rq *data_rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); - struct pblk_line *data_line = pblk_line_get_data(pblk); - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int pos_opt; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on metadata and data I/Os colliding. In - * this case, modify the distance to not be optimal, but move the - * optimal in the right direction. - */ - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - pos_opt = pblk_ppa_to_pos(geo, ppa_opt); - - if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || - test_bit(pos_opt, data_line->blk_bitmap)) - return true; - - if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) - data_line->meta_distance--; - - return false; -} - -static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, - struct nvm_rq *data_rqd) -{ - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *meta_line; - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (meta_line->emeta->mem >= lm->emeta_len[0]) { - spin_unlock(&l_mg->close_lock); - return NULL; - } - spin_unlock(&l_mg->close_lock); - - if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) - return NULL; - - return meta_line; -} - -static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct ppa_addr erase_ppa; - struct pblk_line *meta_line; - int err; - - pblk_ppa_set_empty(&erase_ppa); - - /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); - if (err) { - pblk_err(pblk, "could not setup write request: %d\n", err); - return NVM_IO_ERR; - } - - meta_line = pblk_should_submit_meta_io(pblk, rqd); - - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd, NULL); - if (err) { - pblk_err(pblk, "data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - - if (!pblk_ppa_empty(erase_ppa)) { - /* Submit erase for next data line */ - if (pblk_blk_erase_async(pblk, erase_ppa)) { - struct pblk_line *e_line = pblk_line_get_erase(pblk); - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - int bit; - - atomic_inc(&e_line->left_eblks); - bit = pblk_ppa_to_pos(geo, erase_ppa); - WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap)); - } - } - - if (meta_line) { - /* Submit metadata write for previous data line */ - err = pblk_submit_meta_io(pblk, meta_line); - if (err) { - pblk_err(pblk, "metadata I/O submission failed: %d", - err); - return NVM_IO_ERR; - } - } - - return NVM_IO_OK; -} - -static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); - struct bio *bio = rqd->bio; - - if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, - c_ctx->nr_padded); -} - -static int pblk_submit_write(struct pblk *pblk, int *secs_left) -{ - struct bio *bio; - struct nvm_rq *rqd; - unsigned int secs_avail, secs_to_sync, secs_to_com; - unsigned int secs_to_flush, packed_meta_pgs; - unsigned long pos; - unsigned int resubmit; - - *secs_left = 0; - - spin_lock(&pblk->resubmit_lock); - resubmit = !list_empty(&pblk->resubmit_list); - spin_unlock(&pblk->resubmit_lock); - - /* Resubmit failed writes first */ - if (resubmit) { - struct pblk_c_ctx *r_ctx; - - spin_lock(&pblk->resubmit_lock); - r_ctx = list_first_entry(&pblk->resubmit_list, - struct pblk_c_ctx, list); - list_del(&r_ctx->list); - spin_unlock(&pblk->resubmit_lock); - - secs_avail = r_ctx->nr_valid; - pos = r_ctx->sentry; - - pblk_prepare_resubmit(pblk, pos, secs_avail); - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_avail); - - kfree(r_ctx); - } else { - /* If there are no sectors in the cache, - * flushes (bios without data) will be cleared on - * the cache threads - */ - secs_avail = pblk_rb_read_count(&pblk->rwb); - if (!secs_avail) - return 0; - - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); - if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data) - return 0; - - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, - secs_to_flush); - if (secs_to_sync > pblk->max_write_pgs) { - pblk_err(pblk, "bad buffer sync calculation\n"); - return 0; - } - - secs_to_com = (secs_to_sync > secs_avail) ? - secs_avail : secs_to_sync; - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - } - - packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data); - bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); - rqd->bio = bio; - - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, - secs_avail)) { - pblk_err(pblk, "corrupted write bio\n"); - goto fail_put_bio; - } - - if (pblk_submit_io_set(pblk, rqd)) - goto fail_free_bio; - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_long_add(secs_to_sync, &pblk->sub_writes); -#endif - - *secs_left = 1; - return 0; - -fail_free_bio: - pblk_free_write_rqd(pblk, rqd); -fail_put_bio: - bio_put(bio); - pblk_free_rqd(pblk, rqd, PBLK_WRITE); - - return -EINTR; -} - -int pblk_write_ts(void *data) -{ - struct pblk *pblk = data; - int secs_left; - int write_failure = 0; - - while (!kthread_should_stop()) { - if (!write_failure) { - write_failure = pblk_submit_write(pblk, &secs_left); - - if (secs_left) - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - io_schedule(); - } - - return 0; -} diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h deleted file mode 100644 index 86ffa875bfe1..000000000000 --- a/drivers/lightnvm/pblk.h +++ /dev/null @@ -1,1358 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2015 IT University of Copenhagen (rrpc.h) - * Copyright (C) 2016 CNEX Labs - * Initial release: Matias Bjorling - * Write buffering: Javier Gonzalez - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Implementation of a Physical Block-device target for Open-channel SSDs. - * - */ - -#ifndef PBLK_H_ -#define PBLK_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Run only GC if less than 1/X blocks are free */ -#define GC_LIMIT_INVERSE 5 -#define GC_TIME_MSECS 1000 - -#define PBLK_SECTOR (512) -#define PBLK_EXPOSED_PAGE_SIZE (4096) - -#define PBLK_NR_CLOSE_JOBS (4) - -#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) - -/* Max 512 LUNs per device */ -#define PBLK_MAX_LUNS_BITMAP (4) - -#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR) - -/* Static pool sizes */ -#define PBLK_GEN_WS_POOL_SIZE (2) - -#define PBLK_DEFAULT_OP (11) - -enum { - PBLK_READ = READ, - PBLK_WRITE = WRITE,/* Write from write buffer */ - PBLK_WRITE_INT, /* Internal write - no write buffer */ - PBLK_READ_RECOV, /* Recovery read - errors allowed */ - PBLK_ERASE, -}; - -enum { - /* IO Types */ - PBLK_IOTYPE_USER = 1 << 0, - PBLK_IOTYPE_GC = 1 << 1, - - /* Write buffer flags */ - PBLK_FLUSH_ENTRY = 1 << 2, - PBLK_WRITTEN_DATA = 1 << 3, - PBLK_SUBMITTED_ENTRY = 1 << 4, - PBLK_WRITABLE_ENTRY = 1 << 5, -}; - -enum { - PBLK_BLK_ST_OPEN = 0x1, - PBLK_BLK_ST_CLOSED = 0x2, -}; - -enum { - PBLK_CHUNK_RESET_START, - PBLK_CHUNK_RESET_DONE, - PBLK_CHUNK_RESET_FAILED, -}; - -struct pblk_sec_meta { - u64 reserved; - __le64 lba; -}; - -/* The number of GC lists and the rate-limiter states go together. This way the - * rate-limiter can dictate how much GC is needed based on resource utilization. - */ -#define PBLK_GC_NR_LISTS 4 - -enum { - PBLK_RL_OFF = 0, - PBLK_RL_WERR = 1, - PBLK_RL_HIGH = 2, - PBLK_RL_MID = 3, - PBLK_RL_LOW = 4 -}; - -#define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA) - -/* write buffer completion context */ -struct pblk_c_ctx { - struct list_head list; /* Head for out-of-order completion */ - - unsigned long *lun_bitmap; /* Luns used on current request */ - unsigned int sentry; - unsigned int nr_valid; - unsigned int nr_padded; -}; - -/* read context */ -struct pblk_g_ctx { - void *private; - unsigned long start_time; - u64 lba; -}; - -/* Pad context */ -struct pblk_pad_rq { - struct pblk *pblk; - struct completion wait; - struct kref ref; -}; - -/* Recovery context */ -struct pblk_rec_ctx { - struct pblk *pblk; - struct nvm_rq *rqd; - struct work_struct ws_rec; -}; - -/* Write context */ -struct pblk_w_ctx { - struct bio_list bios; /* Original bios - used for completion - * in REQ_FUA, REQ_FLUSH case - */ - u64 lba; /* Logic addr. associated with entry */ - struct ppa_addr ppa; /* Physic addr. associated with entry */ - int flags; /* Write context flags */ -}; - -struct pblk_rb_entry { - struct ppa_addr cacheline; /* Cacheline for this entry */ - void *data; /* Pointer to data on this entry */ - struct pblk_w_ctx w_ctx; /* Context for this entry */ - struct list_head index; /* List head to enable indexes */ -}; - -#define EMPTY_ENTRY (~0U) - -struct pblk_rb_pages { - struct page *pages; - int order; - struct list_head list; -}; - -struct pblk_rb { - struct pblk_rb_entry *entries; /* Ring buffer entries */ - unsigned int mem; /* Write offset - points to next - * writable entry in memory - */ - unsigned int subm; /* Read offset - points to last entry - * that has been submitted to the media - * to be persisted - */ - unsigned int sync; /* Synced - backpointer that signals - * the last submitted entry that has - * been successfully persisted to media - */ - unsigned int flush_point; /* Sync point - last entry that must be - * flushed to the media. Used with - * REQ_FLUSH and REQ_FUA - */ - unsigned int l2p_update; /* l2p update point - next entry for - * which l2p mapping will be updated to - * contain a device ppa address (instead - * of a cacheline - */ - unsigned int nr_entries; /* Number of entries in write buffer - - * must be a power of two - */ - unsigned int seg_size; /* Size of the data segments being - * stored on each entry. Typically this - * will be 4KB - */ - - unsigned int back_thres; /* Threshold that shall be maintained by - * the backpointer in order to respect - * geo->mw_cunits on a per chunk basis - */ - - struct list_head pages; /* List of data pages */ - - spinlock_t w_lock; /* Write lock */ - spinlock_t s_lock; /* Sync lock */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */ -#endif -}; - -#define PBLK_RECOVERY_SECTORS 16 - -struct pblk_lun { - struct ppa_addr bppa; - struct semaphore wr_sem; -}; - -struct pblk_gc_rq { - struct pblk_line *line; - void *data; - u64 paddr_list[NVM_MAX_VLBA]; - u64 lba_list[NVM_MAX_VLBA]; - int nr_secs; - int secs_to_gc; - struct list_head list; -}; - -struct pblk_gc { - /* These states are not protected by a lock since (i) they are in the - * fast path, and (ii) they are not critical. - */ - int gc_active; - int gc_enabled; - int gc_forced; - - struct task_struct *gc_ts; - struct task_struct *gc_writer_ts; - struct task_struct *gc_reader_ts; - - struct workqueue_struct *gc_line_reader_wq; - struct workqueue_struct *gc_reader_wq; - - struct timer_list gc_timer; - - struct semaphore gc_sem; - atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ - atomic_t pipeline_gc; /* Number of lines in the GC pipeline - - * started reads to finished writes - */ - int w_entries; - - struct list_head w_list; - struct list_head r_list; - - spinlock_t lock; - spinlock_t w_lock; - spinlock_t r_lock; -}; - -struct pblk_rl { - unsigned int high; /* Upper threshold for rate limiter (free run - - * user I/O rate limiter - */ - unsigned int high_pw; /* High rounded up as a power of 2 */ - -#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */ -#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */ - - int rb_windows_pw; /* Number of rate windows in the write buffer - * given as a power-of-2. This guarantees that - * when user I/O is being rate limited, there - * will be reserved enough space for the GC to - * place its payload. A window is of - * pblk->max_write_pgs size, which in NVMe is - * 64, i.e., 256kb. - */ - int rb_budget; /* Total number of entries available for I/O */ - int rb_user_max; /* Max buffer entries available for user I/O */ - int rb_gc_max; /* Max buffer entries available for GC I/O */ - int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ - int rb_state; /* Rate-limiter current state */ - int rb_max_io; /* Maximum size for an I/O giving the config */ - - atomic_t rb_user_cnt; /* User I/O buffer counter */ - atomic_t rb_gc_cnt; /* GC I/O buffer counter */ - atomic_t rb_space; /* Space limit in case of reaching capacity */ - - int rsv_blocks; /* Reserved blocks for GC */ - - int rb_user_active; - int rb_gc_active; - - atomic_t werr_lines; /* Number of write error lines that needs gc */ - - struct timer_list u_timer; - - unsigned long total_blocks; - - atomic_t free_blocks; /* Total number of free blocks (+ OP) */ - atomic_t free_user_blocks; /* Number of user free blocks (no OP) */ -}; - -#define PBLK_LINE_EMPTY (~0U) - -enum { - /* Line Types */ - PBLK_LINETYPE_FREE = 0, - PBLK_LINETYPE_LOG = 1, - PBLK_LINETYPE_DATA = 2, - - /* Line state */ - PBLK_LINESTATE_NEW = 9, - PBLK_LINESTATE_FREE = 10, - PBLK_LINESTATE_OPEN = 11, - PBLK_LINESTATE_CLOSED = 12, - PBLK_LINESTATE_GC = 13, - PBLK_LINESTATE_BAD = 14, - PBLK_LINESTATE_CORRUPT = 15, - - /* GC group */ - PBLK_LINEGC_NONE = 20, - PBLK_LINEGC_EMPTY = 21, - PBLK_LINEGC_LOW = 22, - PBLK_LINEGC_MID = 23, - PBLK_LINEGC_HIGH = 24, - PBLK_LINEGC_FULL = 25, - PBLK_LINEGC_WERR = 26 -}; - -#define PBLK_MAGIC 0x70626c6b /*pblk*/ - -/* emeta/smeta persistent storage format versions: - * Changes in major version requires offline migration. - * Changes in minor version are handled automatically during - * recovery. - */ - -#define SMETA_VERSION_MAJOR (0) -#define SMETA_VERSION_MINOR (1) - -#define EMETA_VERSION_MAJOR (0) -#define EMETA_VERSION_MINOR (2) - -struct line_header { - __le32 crc; - __le32 identifier; /* pblk identifier */ - __u8 uuid[16]; /* instance uuid */ - __le16 type; /* line type */ - __u8 version_major; /* version major */ - __u8 version_minor; /* version minor */ - __le32 id; /* line id for current line */ -}; - -struct line_smeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - /* Previous line metadata */ - __le32 prev_id; /* Line id for previous line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - __le32 rsvd[2]; - - __le64 lun_bitmap[]; -}; - - -/* - * Metadata layout in media: - * First sector: - * 1. struct line_emeta - * 2. bad block bitmap (u64 * window_wr_lun) - * 3. write amplification counters - * Mid sectors (start at lbas_sector): - * 3. nr_lbas (u64) forming lba list - * Last sectors (start at vsc_sector): - * 4. u32 valid sector count (vsc) for all lines (~0U: free line) - */ -struct line_emeta { - struct line_header header; - - __le32 crc; /* Full structure including struct crc */ - - /* Previous line metadata */ - __le32 prev_id; /* Line id for prev line */ - - /* Current line metadata */ - __le64 seq_nr; /* Sequence number for current line */ - - /* Active writers */ - __le32 window_wr_lun; /* Number of parallel LUNs to write */ - - /* Bookkeeping for recovery */ - __le32 next_id; /* Line id for next line */ - __le64 nr_lbas; /* Number of lbas mapped in line */ - __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */ - __le64 bb_bitmap[]; /* Updated bad block bitmap for line */ -}; - - -/* Write amplification counters stored on media */ -struct wa_counters { - __le64 user; /* Number of user written sectors */ - __le64 gc; /* Number of sectors written by GC*/ - __le64 pad; /* Number of padded sectors */ -}; - -struct pblk_emeta { - struct line_emeta *buf; /* emeta buffer in media format */ - int mem; /* Write offset - points to next - * writable entry in memory - */ - atomic_t sync; /* Synced - backpointer that signals the - * last entry that has been successfully - * persisted to media - */ - unsigned int nr_entries; /* Number of emeta entries */ -}; - -struct pblk_smeta { - struct line_smeta *buf; /* smeta buffer in persistent format */ -}; - -struct pblk_w_err_gc { - int has_write_err; - int has_gc_err; - __le64 *lba_list; -}; - -struct pblk_line { - struct pblk *pblk; - unsigned int id; /* Line number corresponds to the - * block line - */ - unsigned int seq_nr; /* Unique line sequence number */ - - int state; /* PBLK_LINESTATE_X */ - int type; /* PBLK_LINETYPE_X */ - int gc_group; /* PBLK_LINEGC_X */ - struct list_head list; /* Free, GC lists */ - - unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */ - - struct nvm_chk_meta *chks; /* Chunks forming line */ - - struct pblk_smeta *smeta; /* Start metadata */ - struct pblk_emeta *emeta; /* End medatada */ - - int meta_line; /* Metadata line id */ - int meta_distance; /* Distance between data and metadata */ - - u64 emeta_ssec; /* Sector where emeta starts */ - - unsigned int sec_in_line; /* Number of usable secs in line */ - - atomic_t blk_in_line; /* Number of good blocks in line */ - unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */ - unsigned long *erase_bitmap; /* Bitmap for erased blocks */ - - unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */ - unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */ - - atomic_t left_eblks; /* Blocks left for erasing */ - atomic_t left_seblks; /* Blocks left for sync erasing */ - - int left_msecs; /* Sectors left for mapping */ - unsigned int cur_sec; /* Sector map pointer */ - unsigned int nr_valid_lbas; /* Number of valid lbas in line */ - - __le32 *vsc; /* Valid sector count in line */ - - struct kref ref; /* Write buffer L2P references */ - atomic_t sec_to_update; /* Outstanding L2P updates to ppa */ - - struct pblk_w_err_gc *w_err_gc; /* Write error gc recovery metadata */ - - spinlock_t lock; /* Necessary for invalid_bitmap only */ -}; - -#define PBLK_DATA_LINES 4 - -enum { - PBLK_EMETA_TYPE_HEADER = 1, /* struct line_emeta first sector */ - PBLK_EMETA_TYPE_LLBA = 2, /* lba list - type: __le64 */ - PBLK_EMETA_TYPE_VSC = 3, /* vsc list - type: __le32 */ -}; - -struct pblk_line_mgmt { - int nr_lines; /* Total number of full lines */ - int nr_free_lines; /* Number of full lines in free list */ - - /* Free lists - use free_lock */ - struct list_head free_list; /* Full lines ready to use */ - struct list_head corrupt_list; /* Full lines corrupted */ - struct list_head bad_list; /* Full lines bad */ - - /* GC lists - use gc_lock */ - struct list_head *gc_lists[PBLK_GC_NR_LISTS]; - struct list_head gc_high_list; /* Full lines ready to GC, high isc */ - struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */ - struct list_head gc_low_list; /* Full lines ready to GC, low isc */ - - struct list_head gc_werr_list; /* Write err recovery list */ - - struct list_head gc_full_list; /* Full lines ready to GC, no valid */ - struct list_head gc_empty_list; /* Full lines close, all valid */ - - struct pblk_line *log_line; /* Current FTL log line */ - struct pblk_line *data_line; /* Current data line */ - struct pblk_line *log_next; /* Next FTL log line */ - struct pblk_line *data_next; /* Next data line */ - - struct list_head emeta_list; /* Lines queued to schedule emeta */ - - __le32 *vsc_list; /* Valid sector counts for all lines */ - - /* Pre-allocated metadata for data lines */ - struct pblk_smeta *sline_meta[PBLK_DATA_LINES]; - struct pblk_emeta *eline_meta[PBLK_DATA_LINES]; - unsigned long meta_bitmap; - - /* Cache and mempool for map/invalid bitmaps */ - struct kmem_cache *bitmap_cache; - mempool_t *bitmap_pool; - - /* Helpers for fast bitmap calculations */ - unsigned long *bb_template; - unsigned long *bb_aux; - - unsigned long d_seq_nr; /* Data line unique sequence number */ - unsigned long l_seq_nr; /* Log line unique sequence number */ - - spinlock_t free_lock; - spinlock_t close_lock; - spinlock_t gc_lock; -}; - -struct pblk_line_meta { - unsigned int smeta_len; /* Total length for smeta */ - unsigned int smeta_sec; /* Sectors needed for smeta */ - - unsigned int emeta_len[4]; /* Lengths for emeta: - * [0]: Total - * [1]: struct line_emeta + - * bb_bitmap + struct wa_counters - * [2]: L2P portion - * [3]: vsc - */ - unsigned int emeta_sec[4]; /* Sectors needed for emeta. Same layout - * as emeta_len - */ - - unsigned int emeta_bb; /* Boundary for bb that affects emeta */ - - unsigned int vsc_list_len; /* Length for vsc list */ - unsigned int sec_bitmap_len; /* Length for sector bitmap in line */ - unsigned int blk_bitmap_len; /* Length for block bitmap in line */ - unsigned int lun_bitmap_len; /* Length for lun bitmap in line */ - - unsigned int blk_per_line; /* Number of blocks in a full line */ - unsigned int sec_per_line; /* Number of sectors in a line */ - unsigned int dsec_per_line; /* Number of data sectors in a line */ - unsigned int min_blk_line; /* Min. number of good blocks in line */ - - unsigned int mid_thrs; /* Threshold for GC mid list */ - unsigned int high_thrs; /* Threshold for GC high list */ - - unsigned int meta_distance; /* Distance between data and metadata */ -}; - -enum { - PBLK_STATE_RUNNING = 0, - PBLK_STATE_STOPPING = 1, - PBLK_STATE_RECOVERING = 2, - PBLK_STATE_STOPPED = 3, -}; - -/* Internal format to support not power-of-2 device formats */ -struct pblk_addrf { - /* gen to dev */ - int sec_stripe; - int ch_stripe; - int lun_stripe; - - /* dev to gen */ - int sec_lun_stripe; - int sec_ws_stripe; -}; - -struct pblk { - struct nvm_tgt_dev *dev; - struct gendisk *disk; - - struct kobject kobj; - - struct pblk_lun *luns; - - struct pblk_line *lines; /* Line array */ - struct pblk_line_mgmt l_mg; /* Line management */ - struct pblk_line_meta lm; /* Line metadata */ - - struct nvm_addrf addrf; /* Aligned address format */ - struct pblk_addrf uaddrf; /* Unaligned address format */ - int addrf_len; - - struct pblk_rb rwb; - - int state; /* pblk line state */ - - int min_write_pgs; /* Minimum amount of pages required by controller */ - int min_write_pgs_data; /* Minimum amount of payload pages */ - int max_write_pgs; /* Maximum amount of pages supported by controller */ - int oob_meta_size; /* Size of OOB sector metadata */ - - sector_t capacity; /* Device capacity when bad blocks are subtracted */ - - int op; /* Percentage of device used for over-provisioning */ - int op_blks; /* Number of blocks used for over-provisioning */ - - /* pblk provisioning values. Used by rate limiter */ - struct pblk_rl rl; - - int sec_per_write; - - guid_t instance_uuid; - - /* Persistent write amplification counters, 4kb sector I/Os */ - atomic64_t user_wa; /* Sectors written by user */ - atomic64_t gc_wa; /* Sectors written by GC */ - atomic64_t pad_wa; /* Padded sectors written */ - - /* Reset values for delta write amplification measurements */ - u64 user_rst_wa; - u64 gc_rst_wa; - u64 pad_rst_wa; - - /* Counters used for calculating padding distribution */ - atomic64_t *pad_dist; /* Padding distribution buckets */ - u64 nr_flush_rst; /* Flushes reset value for pad dist.*/ - atomic64_t nr_flush; /* Number of flush/fua I/O */ - -#ifdef CONFIG_NVM_PBLK_DEBUG - /* Non-persistent debug counters, 4kb sector I/Os */ - atomic_long_t inflight_writes; /* Inflight writes (user and gc) */ - atomic_long_t padded_writes; /* Sectors padded due to flush/fua */ - atomic_long_t padded_wb; /* Sectors padded in write buffer */ - atomic_long_t req_writes; /* Sectors stored on write buffer */ - atomic_long_t sub_writes; /* Sectors submitted from buffer */ - atomic_long_t sync_writes; /* Sectors synced to media */ - atomic_long_t inflight_reads; /* Inflight sector read requests */ - atomic_long_t cache_reads; /* Read requests that hit the cache */ - atomic_long_t sync_reads; /* Completed sector read requests */ - atomic_long_t recov_writes; /* Sectors submitted from recovery */ - atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */ - atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */ -#endif - - spinlock_t lock; - - atomic_long_t read_failed; - atomic_long_t read_empty; - atomic_long_t read_high_ecc; - atomic_long_t read_failed_gc; - atomic_long_t write_failed; - atomic_long_t erase_failed; - - atomic_t inflight_io; /* General inflight I/O counter */ - - struct task_struct *writer_ts; - - /* Simple translation map of logical addresses to physical addresses. - * The logical addresses is known by the host system, while the physical - * addresses are used when writing to the disk block device. - */ - unsigned char *trans_map; - spinlock_t trans_lock; - - struct list_head compl_list; - - spinlock_t resubmit_lock; /* Resubmit list lock */ - struct list_head resubmit_list; /* Resubmit list for failed writes*/ - - mempool_t page_bio_pool; - mempool_t gen_ws_pool; - mempool_t rec_pool; - mempool_t r_rq_pool; - mempool_t w_rq_pool; - mempool_t e_rq_pool; - - struct workqueue_struct *close_wq; - struct workqueue_struct *bb_wq; - struct workqueue_struct *r_end_wq; - - struct timer_list wtimer; - - struct pblk_gc gc; -}; - -struct pblk_line_ws { - struct pblk *pblk; - struct pblk_line *line; - void *priv; - struct work_struct ws; -}; - -#define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx)) -#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx)) - -#define pblk_err(pblk, fmt, ...) \ - pr_err("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_info(pblk, fmt, ...) \ - pr_info("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_warn(pblk, fmt, ...) \ - pr_warn("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) -#define pblk_debug(pblk, fmt, ...) \ - pr_debug("pblk %s: " fmt, pblk->disk->disk_name, ##__VA_ARGS__) - -/* - * pblk ring buffer operations - */ -int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold, - unsigned int seg_sz); -int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio, - unsigned int nr_entries, unsigned int *pos); -int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, - unsigned int *pos); -void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, unsigned int pos); -void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *line, - u64 paddr, unsigned int pos); -struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); -void pblk_rb_flush(struct pblk_rb *rb); - -void pblk_rb_sync_l2p(struct pblk_rb *rb); -unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - unsigned int pos, unsigned int nr_entries, - unsigned int count); -int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba, - struct ppa_addr ppa); -unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries); - -unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries); -unsigned int pblk_rb_ptr_wrap(struct pblk_rb *rb, unsigned int p, - unsigned int nr_entries); -void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags); -unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb); - -unsigned int pblk_rb_read_count(struct pblk_rb *rb); -unsigned int pblk_rb_sync_count(struct pblk_rb *rb); -unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos); - -int pblk_rb_tear_down_check(struct pblk_rb *rb); -int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos); -void pblk_rb_free(struct pblk_rb *rb); -ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); - -/* - * pblk core - */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); -int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); -int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx); -void pblk_discard(struct pblk *pblk, struct bio *bio); -struct nvm_chk_meta *pblk_get_chunk_meta(struct pblk *pblk); -struct nvm_chk_meta *pblk_chunk_get_off(struct pblk *pblk, - struct nvm_chk_meta *lp, - struct ppa_addr ppa); -void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd, void *buf); -int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); -void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd); -struct pblk_line *pblk_line_get(struct pblk *pblk); -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -struct pblk_line *pblk_line_replace_data(struct pblk *pblk); -void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa); -void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd); -int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); -void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); -struct pblk_line *pblk_line_get_data(struct pblk *pblk); -struct pblk_line *pblk_line_get_erase(struct pblk *pblk); -int pblk_line_erase(struct pblk *pblk, struct pblk_line *line); -int pblk_line_is_full(struct pblk_line *line); -void pblk_line_free(struct pblk_line *line); -void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_ws(struct work_struct *work); -void pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_stop(struct pblk *pblk); -void __pblk_pipeline_flush(struct pblk *pblk); -void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), gfp_t gfp_mask, - struct workqueue_struct *wq); -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); -int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line); -int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line, - void *emeta_buf); -int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); -void pblk_line_put(struct kref *ref); -void pblk_line_put_wq(struct kref *ref); -struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); -u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); -void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail, - unsigned long secs_to_flush, bool skip_meta); -void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa, - unsigned long *lun_bitmap); -void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa); -void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap); -int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, - int nr_pages); -void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, - int nr_pages); -void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa); -void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, - u64 paddr); -void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa); -void pblk_update_map_cache(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa); -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa, struct ppa_addr entry_line); -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line, u64 paddr); -void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, - u64 *lba_list, int nr_secs); -int pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, - sector_t blba, int nr_secs, bool *from_cache); -void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd); -void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd); - -/* - * pblk user I/O write path - */ -void pblk_write_to_cache(struct pblk *pblk, struct bio *bio, - unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); - -/* - * pblk map - */ -int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned int sentry, unsigned long *lun_bitmap, - unsigned int valid_secs, struct ppa_addr *erase_ppa); -int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry, - unsigned long *lun_bitmap, unsigned int valid_secs, - unsigned int off); - -/* - * pblk write thread - */ -int pblk_write_ts(void *data); -void pblk_write_timer_fn(struct timer_list *t); -void pblk_write_should_kick(struct pblk *pblk); -void pblk_write_kick(struct pblk *pblk); - -/* - * pblk read path - */ -extern struct bio_set pblk_bio_set; -void pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); -/* - * pblk recovery - */ -struct pblk_line *pblk_recov_l2p(struct pblk *pblk); -int pblk_recov_pad(struct pblk *pblk); -int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); - -/* - * pblk gc - */ -#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ -#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ - -int pblk_gc_init(struct pblk *pblk); -void pblk_gc_exit(struct pblk *pblk, bool graceful); -void pblk_gc_should_start(struct pblk *pblk); -void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_free_full_lines(struct pblk *pblk); -void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, - int *gc_active); -int pblk_gc_sysfs_force(struct pblk *pblk, int force); -void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line); - -/* - * pblk rate limiter - */ -void pblk_rl_init(struct pblk_rl *rl, int budget, int threshold); -void pblk_rl_free(struct pblk_rl *rl); -void pblk_rl_update_rates(struct pblk_rl *rl); -int pblk_rl_high_thrs(struct pblk_rl *rl); -unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl); -int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); -void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); -int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); -void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); -void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_max_io(struct pblk_rl *rl); -void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line, - bool used); -int pblk_rl_is_limit(struct pblk_rl *rl); - -void pblk_rl_werr_line_in(struct pblk_rl *rl); -void pblk_rl_werr_line_out(struct pblk_rl *rl); - -/* - * pblk sysfs - */ -int pblk_sysfs_init(struct gendisk *tdisk); -void pblk_sysfs_exit(struct gendisk *tdisk); - -static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx) -{ - return c_ctx - sizeof(struct nvm_rq); -} - -static inline void *emeta_to_bb(struct line_emeta *emeta) -{ - return emeta->bb_bitmap; -} - -static inline void *emeta_to_wa(struct pblk_line_meta *lm, - struct line_emeta *emeta) -{ - return emeta->bb_bitmap + lm->blk_bitmap_len; -} - -static inline void *emeta_to_lbas(struct pblk *pblk, struct line_emeta *emeta) -{ - return ((void *)emeta + pblk->lm.emeta_len[1]); -} - -static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) -{ - return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]); -} - -static inline int pblk_line_vsc(struct pblk_line *line) -{ - return le32_to_cpu(*line->vsc); -} - -static inline int pblk_ppa_to_line_id(struct ppa_addr p) -{ - return p.a.blk; -} - -static inline struct pblk_line *pblk_ppa_to_line(struct pblk *pblk, - struct ppa_addr p) -{ - return &pblk->lines[pblk_ppa_to_line_id(p)]; -} - -static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p) -{ - return p.a.lun * geo->num_ch + p.a.ch; -} - -static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr, - u64 line_id) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - ppa.ppa = 0; - ppa.g.blk = line_id; - ppa.g.pg = (paddr & ppaf->pg_mask) >> ppaf->pg_offset; - ppa.g.lun = (paddr & ppaf->lun_mask) >> ppaf->lun_offset; - ppa.g.ch = (paddr & ppaf->ch_mask) >> ppaf->ch_offset; - ppa.g.pl = (paddr & ppaf->pln_mask) >> ppaf->pln_offset; - ppa.g.sec = (paddr & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - int secs, chnls, luns; - - ppa.ppa = 0; - - ppa.m.chk = line_id; - - paddr = div_u64_rem(paddr, uaddrf->sec_stripe, &secs); - ppa.m.sec = secs; - - paddr = div_u64_rem(paddr, uaddrf->ch_stripe, &chnls); - ppa.m.grp = chnls; - - paddr = div_u64_rem(paddr, uaddrf->lun_stripe, &luns); - ppa.m.pu = luns; - - ppa.m.sec += uaddrf->sec_stripe * paddr; - } - - return ppa; -} - -static inline struct nvm_chk_meta *pblk_dev_ppa_to_chunk(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *line = pblk_ppa_to_line(pblk, p); - int pos = pblk_ppa_to_pos(geo, p); - - return &line->chks[pos]; -} - -static inline u64 pblk_dev_ppa_to_chunk_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return dev_to_chunk_addr(dev->parent, &pblk->addrf, p); -} - -static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk, - struct ppa_addr p) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - u64 paddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&pblk->addrf; - - paddr = (u64)p.g.ch << ppaf->ch_offset; - paddr |= (u64)p.g.lun << ppaf->lun_offset; - paddr |= (u64)p.g.pg << ppaf->pg_offset; - paddr |= (u64)p.g.pl << ppaf->pln_offset; - paddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - struct pblk_addrf *uaddrf = &pblk->uaddrf; - u64 secs = p.m.sec; - int sec_stripe; - - paddr = (u64)p.m.grp * uaddrf->sec_stripe; - paddr += (u64)p.m.pu * uaddrf->sec_lun_stripe; - - secs = div_u64_rem(secs, uaddrf->sec_stripe, &sec_stripe); - paddr += secs * uaddrf->sec_ws_stripe; - paddr += sec_stripe; - } - - return paddr; -} - -static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa32_to_ppa64(dev->parent, &pblk->addrf, ppa32); -} - -static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64) -{ - struct nvm_tgt_dev *dev = pblk->dev; - - return nvm_ppa64_to_ppa32(dev->parent, &pblk->addrf, ppa64); -} - -static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk, - sector_t lba) -{ - struct ppa_addr ppa; - - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - ppa = pblk_ppa32_to_ppa64(pblk, map[lba]); - } else { - struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map; - - ppa = map[lba]; - } - - return ppa; -} - -static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba, - struct ppa_addr ppa) -{ - if (pblk->addrf_len < 32) { - u32 *map = (u32 *)pblk->trans_map; - - map[lba] = pblk_ppa64_to_ppa32(pblk, ppa); - } else { - u64 *map = (u64 *)pblk->trans_map; - - map[lba] = ppa.ppa; - } -} - -static inline int pblk_ppa_empty(struct ppa_addr ppa_addr) -{ - return (ppa_addr.ppa == ADDR_EMPTY); -} - -static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr) -{ - ppa_addr->ppa = ADDR_EMPTY; -} - -static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa) -{ - return (lppa.ppa == rppa.ppa); -} - -static inline int pblk_addr_in_cache(struct ppa_addr ppa) -{ - return (ppa.ppa != ADDR_EMPTY && ppa.c.is_cached); -} - -static inline int pblk_addr_to_cacheline(struct ppa_addr ppa) -{ - return ppa.c.line; -} - -static inline struct ppa_addr pblk_cacheline_to_addr(int addr) -{ - struct ppa_addr p; - - p.c.line = addr; - p.c.is_cached = 1; - - return p; -} - -static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk, - struct line_header *header) -{ - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)header + sizeof(crc), - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_smeta_crc(struct pblk *pblk, - struct line_smeta *smeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)smeta + - sizeof(struct line_header) + sizeof(crc), - lm->smeta_len - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline u32 pblk_calc_emeta_crc(struct pblk *pblk, - struct line_emeta *emeta) -{ - struct pblk_line_meta *lm = &pblk->lm; - u32 crc = ~(u32)0; - - crc = crc32_le(crc, (unsigned char *)emeta + - sizeof(struct line_header) + sizeof(crc), - lm->emeta_len[0] - - sizeof(struct line_header) - sizeof(crc)); - - return crc; -} - -static inline int pblk_io_aligned(struct pblk *pblk, int nr_secs) -{ - return !(nr_secs % pblk->min_write_pgs); -} - -#ifdef CONFIG_NVM_PBLK_DEBUG -static inline void print_ppa(struct pblk *pblk, struct ppa_addr *p, - char *msg, int error) -{ - struct nvm_geo *geo = &pblk->dev->geo; - - if (p->c.is_cached) { - pblk_err(pblk, "ppa: (%s: %x) cache line: %llu\n", - msg, error, (u64)p->c.line); - } else if (geo->version == NVM_OCSSD_SPEC_12) { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n", - msg, error, - p->g.ch, p->g.lun, p->g.blk, - p->g.pg, p->g.pl, p->g.sec); - } else { - pblk_err(pblk, "ppa: (%s: %x):ch:%d,lun:%d,chk:%d,sec:%d\n", - msg, error, - p->m.grp, p->m.pu, p->m.chk, p->m.sec); - } -} - -static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, - int error) -{ - int bit = -1; - - if (rqd->nr_ppas == 1) { - print_ppa(pblk, &rqd->ppa_addr, "rqd", error); - return; - } - - while ((bit = find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas, - bit + 1)) < rqd->nr_ppas) { - print_ppa(pblk, &rqd->ppa_list[bit], "rqd", error); - } - - pblk_err(pblk, "error:%d, ppa_status:%llx\n", error, rqd->ppa_status); -} - -static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, - struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_geo *geo = &tgt_dev->geo; - struct ppa_addr *ppa; - int i; - - for (i = 0; i < nr_ppas; i++) { - ppa = &ppas[i]; - - if (geo->version == NVM_OCSSD_SPEC_12) { - if (!ppa->c.is_cached && - ppa->g.ch < geo->num_ch && - ppa->g.lun < geo->num_lun && - ppa->g.pl < geo->num_pln && - ppa->g.blk < geo->num_chk && - ppa->g.pg < geo->num_pg && - ppa->g.sec < geo->ws_min) - continue; - } else { - if (!ppa->c.is_cached && - ppa->m.grp < geo->num_ch && - ppa->m.pu < geo->num_lun && - ppa->m.chk < geo->num_chk && - ppa->m.sec < geo->clba) - continue; - } - - print_ppa(tgt_dev->q->queuedata, ppa, "boundary", i); - - return 1; - } - return 0; -} - -static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); - - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } - - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - int i; - - for (i = 0; i < rqd->nr_ppas; i++) { - line = pblk_ppa_to_line(pblk, ppa_list[i]); - - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pblk_err(pblk, "bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } - - return 0; -} -#endif - -static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) -{ - struct pblk_line_meta *lm = &pblk->lm; - - if (paddr > lm->sec_per_line) - return 1; - - return 0; -} - -static inline unsigned int pblk_get_bi_idx(struct bio *bio) -{ - return bio->bi_iter.bi_idx; -} - -static inline sector_t pblk_get_lba(struct bio *bio) -{ - return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; -} - -static inline unsigned int pblk_get_secs(struct bio *bio) -{ - return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE; -} - -static inline char *pblk_disk_name(struct pblk *pblk) -{ - struct gendisk *disk = pblk->disk; - - return disk->disk_name; -} - -static inline unsigned int pblk_get_min_chks(struct pblk *pblk) -{ - struct pblk_line_meta *lm = &pblk->lm; - /* In a worst-case scenario every line will have OP invalid sectors. - * We will then need a minimum of 1/OP lines to free up a single line - */ - - return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line; -} - -static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk, - void *meta, int index) -{ - return meta + - max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * index; -} - -static inline int pblk_dma_meta_size(struct pblk *pblk) -{ - return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size) - * NVM_MAX_VLBA; -} - -static inline int pblk_is_oob_meta_supported(struct pblk *pblk) -{ - return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta); -} -#endif /* PBLK_H_ */ diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cbc509784b2e..dfaacd472e5d 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -12,7 +12,6 @@ obj-$(CONFIG_NVME_TCP) += nvme-tcp.o nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o -nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dfd9dec0c1f6..ce33014e3eb0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -587,9 +587,6 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->ndev) - nvme_nvm_unregister(ns); - put_disk(ns->disk); nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); @@ -3218,9 +3215,6 @@ static const struct attribute_group nvme_ns_id_attr_group = { const struct attribute_group *nvme_ns_id_attr_groups[] = { &nvme_ns_id_attr_group, -#ifdef CONFIG_NVM - &nvme_nvm_attr_group, -#endif NULL, }; @@ -3767,13 +3761,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (nvme_update_ns_info(ns, id)) goto out_put_disk; - if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk->disk_name, node)) { - dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; - } - } - down_write(&ctrl->namespaces_rwsem); list_add_tail(&ns->list, &ctrl->namespaces); up_write(&ctrl->namespaces_rwsem); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 305ddd415e45..22314962842d 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -342,9 +342,7 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_IO64_CMD: return nvme_user_cmd64(ns->ctrl, ns, argp); default: - if (!ns->ndev) - return -ENOTTY; - return nvme_nvm_ioctl(ns, cmd, argp); + return -ENOTTY; } } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c deleted file mode 100644 index e9d9ad47f70f..000000000000 --- a/drivers/nvme/host/lightnvm.c +++ /dev/null @@ -1,1274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * nvme-lightnvm.c - LightNVM NVMe device - * - * Copyright (C) 2014-2015 IT University of Copenhagen - * Initial release: Matias Bjorling - */ - -#include "nvme.h" - -#include -#include -#include -#include -#include -#include - -enum nvme_nvm_admin_opcode { - nvme_nvm_admin_identity = 0xe2, - nvme_nvm_admin_get_bb_tbl = 0xf2, - nvme_nvm_admin_set_bb_tbl = 0xf1, -}; - -enum nvme_nvm_log_page { - NVME_NVM_LOG_REPORT_CHUNK = 0xca, -}; - -struct nvme_nvm_ph_rw { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_erase_blk { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le64 resv; -}; - -struct nvme_nvm_identity { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __u32 rsvd11[6]; -}; - -struct nvme_nvm_getbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __u32 rsvd4[4]; -}; - -struct nvme_nvm_setbbtbl { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le64 rsvd[2]; - __le64 prp1; - __le64 prp2; - __le64 spba; - __le16 nlb; - __u8 value; - __u8 rsvd3; - __u32 rsvd4[3]; -}; - -struct nvme_nvm_command { - union { - struct nvme_common_command common; - struct nvme_nvm_ph_rw ph_rw; - struct nvme_nvm_erase_blk erase; - struct nvme_nvm_identity identity; - struct nvme_nvm_getbbtbl get_bb; - struct nvme_nvm_setbbtbl set_bb; - }; -}; - -struct nvme_nvm_id12_grp { - __u8 mtype; - __u8 fmtype; - __le16 res16; - __u8 num_ch; - __u8 num_lun; - __u8 num_pln; - __u8 rsvd1; - __le16 num_chk; - __le16 num_pg; - __le16 fpg_sz; - __le16 csecs; - __le16 sos; - __le16 rsvd2; - __le32 trdt; - __le32 trdm; - __le32 tprt; - __le32 tprm; - __le32 tbet; - __le32 tbem; - __le32 mpos; - __le32 mccap; - __le16 cpar; - __u8 reserved[906]; -} __packed; - -struct nvme_nvm_id12_addrf { - __u8 ch_offset; - __u8 ch_len; - __u8 lun_offset; - __u8 lun_len; - __u8 pln_offset; - __u8 pln_len; - __u8 blk_offset; - __u8 blk_len; - __u8 pg_offset; - __u8 pg_len; - __u8 sec_offset; - __u8 sec_len; - __u8 res[4]; -} __packed; - -struct nvme_nvm_id12 { - __u8 ver_id; - __u8 vmnt; - __u8 cgrps; - __u8 res; - __le32 cap; - __le32 dom; - struct nvme_nvm_id12_addrf ppaf; - __u8 resv[228]; - struct nvme_nvm_id12_grp grp; - __u8 resv2[2880]; -} __packed; - -struct nvme_nvm_bb_tbl { - __u8 tblid[4]; - __le16 verid; - __le16 revid; - __le32 rvsd1; - __le32 tblks; - __le32 tfact; - __le32 tgrown; - __le32 tdresv; - __le32 thresv; - __le32 rsvd2[8]; - __u8 blk[]; -}; - -struct nvme_nvm_id20_addrf { - __u8 grp_len; - __u8 pu_len; - __u8 chk_len; - __u8 lba_len; - __u8 resv[4]; -}; - -struct nvme_nvm_id20 { - __u8 mjr; - __u8 mnr; - __u8 resv[6]; - - struct nvme_nvm_id20_addrf lbaf; - - __le32 mccap; - __u8 resv2[12]; - - __u8 wit; - __u8 resv3[31]; - - /* Geometry */ - __le16 num_grp; - __le16 num_pu; - __le32 num_chk; - __le32 clba; - __u8 resv4[52]; - - /* Write data requirements */ - __le32 ws_min; - __le32 ws_opt; - __le32 mw_cunits; - __le32 maxoc; - __le32 maxocpu; - __u8 resv5[44]; - - /* Performance related metrics */ - __le32 trdt; - __le32 trdm; - __le32 twrt; - __le32 twrm; - __le32 tcrst; - __le32 tcrsm; - __u8 resv6[40]; - - /* Reserved area */ - __u8 resv7[2816]; - - /* Vendor specific */ - __u8 vs[1024]; -}; - -struct nvme_nvm_chk_meta { - __u8 state; - __u8 type; - __u8 wi; - __u8 rsvd[5]; - __le64 slba; - __le64 cnlb; - __le64 wp; -}; - -/* - * Check we didn't inadvertently grow the command struct - */ -static inline void _nvme_nvm_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_grp) != 960); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12_addrf) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id12) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20_addrf) != 8); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id20) != NVME_IDENTIFY_DATA_SIZE); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != 32); - BUILD_BUG_ON(sizeof(struct nvme_nvm_chk_meta) != - sizeof(struct nvm_chk_meta)); -} - -static void nvme_nvm_set_addr_12(struct nvm_addrf_12 *dst, - struct nvme_nvm_id12_addrf *src) -{ - dst->ch_len = src->ch_len; - dst->lun_len = src->lun_len; - dst->blk_len = src->blk_len; - dst->pg_len = src->pg_len; - dst->pln_len = src->pln_len; - dst->sec_len = src->sec_len; - - dst->ch_offset = src->ch_offset; - dst->lun_offset = src->lun_offset; - dst->blk_offset = src->blk_offset; - dst->pg_offset = src->pg_offset; - dst->pln_offset = src->pln_offset; - dst->sec_offset = src->sec_offset; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset; - dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset; - dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_12(struct nvme_nvm_id12 *id, - struct nvm_geo *geo) -{ - struct nvme_nvm_id12_grp *src; - int sec_per_pg, sec_per_pl, pg_per_blk; - - if (id->cgrps != 1) - return -EINVAL; - - src = &id->grp; - - if (src->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* 1.2 spec. only reports a single version id - unfold */ - geo->major_ver_id = id->ver_id; - geo->minor_ver_id = 2; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_12; - - geo->num_ch = src->num_ch; - geo->num_lun = src->num_lun; - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le16_to_cpu(src->num_chk); - - geo->csecs = le16_to_cpu(src->csecs); - geo->sos = le16_to_cpu(src->sos); - - pg_per_blk = le16_to_cpu(src->num_pg); - sec_per_pg = le16_to_cpu(src->fpg_sz) / geo->csecs; - sec_per_pl = sec_per_pg * src->num_pln; - geo->clba = sec_per_pl * pg_per_blk; - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = sec_per_pg; - geo->ws_opt = sec_per_pg; - geo->mw_cunits = geo->ws_opt << 3; /* default to MLC safe values */ - - /* Do not impose values for maximum number of open blocks as it is - * unspecified in 1.2. Users of 1.2 must be aware of this and eventually - * specify these values through a quirk if restrictions apply. - */ - geo->maxoc = geo->all_luns * geo->num_chk; - geo->maxocpu = geo->num_chk; - - geo->mccap = le32_to_cpu(src->mccap); - - geo->trdt = le32_to_cpu(src->trdt); - geo->trdm = le32_to_cpu(src->trdm); - geo->tprt = le32_to_cpu(src->tprt); - geo->tprm = le32_to_cpu(src->tprm); - geo->tbet = le32_to_cpu(src->tbet); - geo->tbem = le32_to_cpu(src->tbem); - - /* 1.2 compatibility */ - geo->vmnt = id->vmnt; - geo->cap = le32_to_cpu(id->cap); - geo->dom = le32_to_cpu(id->dom); - - geo->mtype = src->mtype; - geo->fmtype = src->fmtype; - - geo->cpar = le16_to_cpu(src->cpar); - geo->mpos = le32_to_cpu(src->mpos); - - geo->pln_mode = NVM_PLANE_SINGLE; - - if (geo->mpos & 0x020202) { - geo->pln_mode = NVM_PLANE_DOUBLE; - geo->ws_opt <<= 1; - } else if (geo->mpos & 0x040404) { - geo->pln_mode = NVM_PLANE_QUAD; - geo->ws_opt <<= 2; - } - - geo->num_pln = src->num_pln; - geo->num_pg = le16_to_cpu(src->num_pg); - geo->fpg_sz = le16_to_cpu(src->fpg_sz); - - nvme_nvm_set_addr_12((struct nvm_addrf_12 *)&geo->addrf, &id->ppaf); - - return 0; -} - -static void nvme_nvm_set_addr_20(struct nvm_addrf *dst, - struct nvme_nvm_id20_addrf *src) -{ - dst->ch_len = src->grp_len; - dst->lun_len = src->pu_len; - dst->chk_len = src->chk_len; - dst->sec_len = src->lba_len; - - dst->sec_offset = 0; - dst->chk_offset = dst->sec_len; - dst->lun_offset = dst->chk_offset + dst->chk_len; - dst->ch_offset = dst->lun_offset + dst->lun_len; - - dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset; - dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset; - dst->chk_mask = ((1ULL << dst->chk_len) - 1) << dst->chk_offset; - dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset; -} - -static int nvme_nvm_setup_20(struct nvme_nvm_id20 *id, - struct nvm_geo *geo) -{ - geo->major_ver_id = id->mjr; - geo->minor_ver_id = id->mnr; - - /* Set compacted version for upper layers */ - geo->version = NVM_OCSSD_SPEC_20; - - geo->num_ch = le16_to_cpu(id->num_grp); - geo->num_lun = le16_to_cpu(id->num_pu); - geo->all_luns = geo->num_ch * geo->num_lun; - - geo->num_chk = le32_to_cpu(id->num_chk); - geo->clba = le32_to_cpu(id->clba); - - geo->all_chunks = geo->all_luns * geo->num_chk; - geo->total_secs = geo->clba * geo->all_chunks; - - geo->ws_min = le32_to_cpu(id->ws_min); - geo->ws_opt = le32_to_cpu(id->ws_opt); - geo->mw_cunits = le32_to_cpu(id->mw_cunits); - geo->maxoc = le32_to_cpu(id->maxoc); - geo->maxocpu = le32_to_cpu(id->maxocpu); - - geo->trdt = le32_to_cpu(id->trdt); - geo->trdm = le32_to_cpu(id->trdm); - geo->tprt = le32_to_cpu(id->twrt); - geo->tprm = le32_to_cpu(id->twrm); - geo->tbet = le32_to_cpu(id->tcrst); - geo->tbem = le32_to_cpu(id->tcrsm); - - nvme_nvm_set_addr_20(&geo->addrf, &id->lbaf); - - return 0; -} - -static int nvme_nvm_identity(struct nvm_dev *nvmdev) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_id12 *id; - struct nvme_nvm_command c = {}; - int ret; - - c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->head->ns_id); - - id = kmalloc(sizeof(struct nvme_nvm_id12), GFP_KERNEL); - if (!id) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - id, sizeof(struct nvme_nvm_id12)); - if (ret) { - ret = -EIO; - goto out; - } - - /* - * The 1.2 and 2.0 specifications share the first byte in their geometry - * command to make it possible to know what version a device implements. - */ - switch (id->ver_id) { - case 1: - ret = nvme_nvm_setup_12(id, &nvmdev->geo); - break; - case 2: - ret = nvme_nvm_setup_20((struct nvme_nvm_id20 *)id, - &nvmdev->geo); - break; - default: - dev_err(ns->ctrl->device, "OCSSD revision not supported (%d)\n", - id->ver_id); - ret = -EINVAL; - } - -out: - kfree(id); - return ret; -} - -static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - u8 *blks) -{ - struct request_queue *q = nvmdev->q; - struct nvm_geo *geo = &nvmdev->geo; - struct nvme_ns *ns = q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_command c = {}; - struct nvme_nvm_bb_tbl *bb_tbl; - int nr_blks = geo->num_chk * geo->num_pln; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks; - int ret = 0; - - c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.get_bb.spba = cpu_to_le64(ppa.ppa); - - bb_tbl = kzalloc(tblsz, GFP_KERNEL); - if (!bb_tbl) - return -ENOMEM; - - ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, - bb_tbl, tblsz); - if (ret) { - dev_err(ctrl->device, "get bad block table failed (%d)\n", ret); - ret = -EIO; - goto out; - } - - if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || - bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(ctrl->device, "bbt format mismatch\n"); - ret = -EINVAL; - goto out; - } - - if (le16_to_cpu(bb_tbl->verid) != 1) { - ret = -EINVAL; - dev_err(ctrl->device, "bbt version not supported\n"); - goto out; - } - - if (le32_to_cpu(bb_tbl->tblks) != nr_blks) { - ret = -EINVAL; - dev_err(ctrl->device, - "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blks); - goto out; - } - - memcpy(blks, bb_tbl->blk, geo->num_chk * geo->num_pln); -out: - kfree(bb_tbl); - return ret; -} - -static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, - int nr_ppas, int type) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_nvm_command c = {}; - int ret = 0; - - c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); - c.set_bb.spba = cpu_to_le64(ppas->ppa); - c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); - c.set_bb.value = type; - - ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, - NULL, 0); - if (ret) - dev_err(ns->ctrl->device, "set bad block table failed (%d)\n", - ret); - return ret; -} - -/* - * Expect the lba in device format - */ -static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, - sector_t slba, int nchks, - struct nvm_chk_meta *meta) -{ - struct nvm_geo *geo = &ndev->geo; - struct nvme_ns *ns = ndev->q->queuedata; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off; - struct ppa_addr ppa; - size_t left = nchks * sizeof(struct nvme_nvm_chk_meta); - size_t log_pos, offset, len; - int i, max_len; - int ret = 0; - - /* - * limit requests to maximum 256K to avoid issuing arbitrary large - * requests when the device does not specific a maximum transfer size. - */ - max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024); - - dev_meta = kmalloc(max_len, GFP_KERNEL); - if (!dev_meta) - return -ENOMEM; - - /* Normalize lba address space to obtain log offset */ - ppa.ppa = slba; - ppa = dev_to_generic_addr(ndev, ppa); - - log_pos = ppa.m.chk; - log_pos += ppa.m.pu * geo->num_chk; - log_pos += ppa.m.grp * geo->num_lun * geo->num_chk; - - offset = log_pos * sizeof(struct nvme_nvm_chk_meta); - - while (left) { - len = min_t(unsigned int, left, max_len); - - memset(dev_meta, 0, max_len); - dev_meta_off = dev_meta; - - ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, - dev_meta, len, offset); - if (ret) { - dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); - break; - } - - for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) { - meta->state = dev_meta_off->state; - meta->type = dev_meta_off->type; - meta->wi = dev_meta_off->wi; - meta->slba = le64_to_cpu(dev_meta_off->slba); - meta->cnlb = le64_to_cpu(dev_meta_off->cnlb); - meta->wp = le64_to_cpu(dev_meta_off->wp); - - meta++; - dev_meta_off++; - } - - offset += len; - left -= len; - } - - kfree(dev_meta); - - return ret; -} - -static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, - struct nvme_nvm_command *c) -{ - c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); - c->ph_rw.control = cpu_to_le16(rqd->flags); - c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1); -} - -static void nvme_nvm_end_io(struct request *rq, blk_status_t status) -{ - struct nvm_rq *rqd = rq->end_io_data; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - nvm_end_io(rqd); - - kfree(nvme_req(rq)->cmd); - blk_mq_free_request(rq); -} - -static struct request *nvme_nvm_alloc_request(struct request_queue *q, - struct nvm_rq *rqd, - struct nvme_nvm_command *cmd) -{ - struct nvme_ns *ns = q->queuedata; - struct request *rq; - - nvme_nvm_rqtocmd(rqd, ns, cmd); - - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); - if (IS_ERR(rq)) - return rq; - - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - - if (rqd->bio) - blk_rq_append_bio(rq, rqd->bio); - else - rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - - return rq; -} - -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, - void *buf) -{ - struct nvm_geo *geo = &dev->geo; - struct request_queue *q = dev->q; - struct nvme_nvm_command *cmd; - struct request *rq; - int ret; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; - - rq = nvme_nvm_alloc_request(q, rqd, cmd); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - goto err_free_cmd; - } - - if (buf) { - ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, - GFP_KERNEL); - if (ret) - goto err_free_cmd; - } - - rq->end_io_data = rqd; - - blk_execute_rq_nowait(NULL, rq, 0, nvme_nvm_end_io); - - return 0; - -err_free_cmd: - kfree(cmd); - return ret; -} - -static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name, - int size) -{ - struct nvme_ns *ns = nvmdev->q->queuedata; - - return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0); -} - -static void nvme_nvm_destroy_dma_pool(void *pool) -{ - struct dma_pool *dma_pool = pool; - - dma_pool_destroy(dma_pool); -} - -static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, - gfp_t mem_flags, dma_addr_t *dma_handler) -{ - return dma_pool_alloc(pool, mem_flags, dma_handler); -} - -static void nvme_nvm_dev_dma_free(void *pool, void *addr, - dma_addr_t dma_handler) -{ - dma_pool_free(pool, addr, dma_handler); -} - -static struct nvm_dev_ops nvme_nvm_dev_ops = { - .identity = nvme_nvm_identity, - - .get_bb_tbl = nvme_nvm_get_bb_tbl, - .set_bb_tbl = nvme_nvm_set_bb_tbl, - - .get_chk_meta = nvme_nvm_get_chk_meta, - - .submit_io = nvme_nvm_submit_io, - - .create_dma_pool = nvme_nvm_create_dma_pool, - .destroy_dma_pool = nvme_nvm_destroy_dma_pool, - .dev_dma_alloc = nvme_nvm_dev_dma_alloc, - .dev_dma_free = nvme_nvm_dev_dma_free, -}; - -static int nvme_nvm_submit_user_cmd(struct request_queue *q, - struct nvme_ns *ns, - struct nvme_nvm_command *vcmd, - void __user *ubuf, unsigned int bufflen, - void __user *meta_buf, unsigned int meta_len, - void __user *ppa_buf, unsigned int ppa_len, - u32 *result, u64 *status, unsigned int timeout) -{ - bool write = nvme_is_write((struct nvme_command *)vcmd); - struct nvm_dev *dev = ns->ndev; - struct request *rq; - struct bio *bio = NULL; - __le64 *ppa_list = NULL; - dma_addr_t ppa_dma; - __le64 *metadata = NULL; - dma_addr_t metadata_dma; - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); - if (IS_ERR(rq)) { - ret = -ENOMEM; - goto err_cmd; - } - - if (timeout) - rq->timeout = timeout; - - if (ppa_buf && ppa_len) { - ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); - if (!ppa_list) { - ret = -ENOMEM; - goto err_rq; - } - if (copy_from_user(ppa_list, (void __user *)ppa_buf, - sizeof(u64) * (ppa_len + 1))) { - ret = -EFAULT; - goto err_ppa; - } - vcmd->ph_rw.spba = cpu_to_le64(ppa_dma); - } else { - vcmd->ph_rw.spba = cpu_to_le64((uintptr_t)ppa_buf); - } - - if (ubuf && bufflen) { - ret = blk_rq_map_user(q, rq, NULL, ubuf, bufflen, GFP_KERNEL); - if (ret) - goto err_ppa; - bio = rq->bio; - - if (meta_buf && meta_len) { - metadata = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, - &metadata_dma); - if (!metadata) { - ret = -ENOMEM; - goto err_map; - } - - if (write) { - if (copy_from_user(metadata, - (void __user *)meta_buf, - meta_len)) { - ret = -EFAULT; - goto err_meta; - } - } - vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); - } - - bio_set_dev(bio, ns->disk->part0); - } - - blk_execute_rq(NULL, rq, 0); - - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else if (nvme_req(rq)->status & 0x7ff) - ret = -EIO; - if (result) - *result = nvme_req(rq)->status & 0x7ff; - if (status) - *status = le64_to_cpu(nvme_req(rq)->result.u64); - - if (metadata && !ret && !write) { - if (copy_to_user(meta_buf, (void *)metadata, meta_len)) - ret = -EFAULT; - } -err_meta: - if (meta_buf && meta_len) - dma_pool_free(dev->dma_pool, metadata, metadata_dma); -err_map: - if (bio) - blk_rq_unmap_user(bio); -err_ppa: - if (ppa_buf && ppa_len) - dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); -err_rq: - blk_mq_free_request(rq); -err_cmd: - return ret; -} - -static int nvme_nvm_submit_vio(struct nvme_ns *ns, - struct nvm_user_vio __user *uvio) -{ - struct nvm_user_vio vio; - struct nvme_nvm_command c; - unsigned int length; - int ret; - - if (copy_from_user(&vio, uvio, sizeof(vio))) - return -EFAULT; - if (vio.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); - c.ph_rw.control = cpu_to_le16(vio.control); - c.ph_rw.length = cpu_to_le16(vio.nppas); - - length = (vio.nppas + 1) << ns->lba_shift; - - ret = nvme_nvm_submit_user_cmd(ns->queue, ns, &c, - (void __user *)(uintptr_t)vio.addr, length, - (void __user *)(uintptr_t)vio.metadata, - vio.metadata_len, - (void __user *)(uintptr_t)vio.ppa_list, vio.nppas, - &vio.result, &vio.status, 0); - - if (ret && copy_to_user(uvio, &vio, sizeof(vio))) - return -EFAULT; - - return ret; -} - -static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, - struct nvm_passthru_vio __user *uvcmd) -{ - struct nvm_passthru_vio vcmd; - struct nvme_nvm_command c; - struct request_queue *q; - unsigned int timeout = 0; - int ret; - - if (copy_from_user(&vcmd, uvcmd, sizeof(vcmd))) - return -EFAULT; - if ((vcmd.opcode != 0xF2) && (!capable(CAP_SYS_ADMIN))) - return -EACCES; - if (vcmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->head->ns_id); - c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); - /* cdw11-12 */ - c.ph_rw.length = cpu_to_le16(vcmd.nppas); - c.ph_rw.control = cpu_to_le16(vcmd.control); - c.common.cdw13 = cpu_to_le32(vcmd.cdw13); - c.common.cdw14 = cpu_to_le32(vcmd.cdw14); - c.common.cdw15 = cpu_to_le32(vcmd.cdw15); - - if (vcmd.timeout_ms) - timeout = msecs_to_jiffies(vcmd.timeout_ms); - - q = admin ? ns->ctrl->admin_q : ns->queue; - - ret = nvme_nvm_submit_user_cmd(q, ns, - (struct nvme_nvm_command *)&c, - (void __user *)(uintptr_t)vcmd.addr, vcmd.data_len, - (void __user *)(uintptr_t)vcmd.metadata, - vcmd.metadata_len, - (void __user *)(uintptr_t)vcmd.ppa_list, vcmd.nppas, - &vcmd.result, &vcmd.status, timeout); - - if (ret && copy_to_user(uvcmd, &vcmd, sizeof(vcmd))) - return -EFAULT; - - return ret; -} - -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) -{ - switch (cmd) { - case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, argp); - case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, argp); - case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, argp); - default: - return -ENOTTY; - } -} - -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) -{ - struct request_queue *q = ns->queue; - struct nvm_dev *dev; - struct nvm_geo *geo; - - _nvme_nvm_check_size(); - - dev = nvm_alloc_dev(node); - if (!dev) - return -ENOMEM; - - /* Note that csecs and sos will be overridden if it is a 1.2 drive. */ - geo = &dev->geo; - geo->csecs = 1 << ns->lba_shift; - geo->sos = ns->ms; - if (ns->features & NVME_NS_EXT_LBAS) - geo->ext = true; - else - geo->ext = false; - geo->mdts = ns->ctrl->max_hw_sectors; - - dev->q = q; - memcpy(dev->name, disk_name, DISK_NAME_LEN); - dev->ops = &nvme_nvm_dev_ops; - dev->private_data = ns; - ns->ndev = dev; - - return nvm_register(dev); -} - -void nvme_nvm_unregister(struct nvme_ns *ns) -{ - nvm_unregister(ns->ndev); -} - -static ssize_t nvm_dev_attr_show(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "version") == 0) { - if (geo->major_ver_id == 1) - return scnprintf(page, PAGE_SIZE, "%u\n", - geo->major_ver_id); - else - return scnprintf(page, PAGE_SIZE, "%u.%u\n", - geo->major_ver_id, - geo->minor_ver_id); - } else if (strcmp(attr->name, "capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->cap); - } else if (strcmp(attr->name, "read_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdt); - } else if (strcmp(attr->name, "read_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->trdm); - } else { - return scnprintf(page, - PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_ppaf(struct nvm_addrf_12 *ppaf, char *page) -{ - return scnprintf(page, PAGE_SIZE, - "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", - ppaf->ch_offset, ppaf->ch_len, - ppaf->lun_offset, ppaf->lun_len, - ppaf->pln_offset, ppaf->pln_len, - ppaf->blk_offset, ppaf->blk_len, - ppaf->pg_offset, ppaf->pg_len, - ppaf->sec_offset, ppaf->sec_len); -} - -static ssize_t nvm_dev_attr_show_12(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "vendor_opcode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->vmnt); - } else if (strcmp(attr->name, "device_mode") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->dom); - /* kept for compatibility */ - } else if (strcmp(attr->name, "media_manager") == 0) { - return scnprintf(page, PAGE_SIZE, "%s\n", "gennvm"); - } else if (strcmp(attr->name, "ppa_format") == 0) { - return nvm_dev_attr_show_ppaf((void *)&geo->addrf, page); - } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mtype); - } else if (strcmp(attr->name, "flash_media_type") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fmtype); - } else if (strcmp(attr->name, "num_channels") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "num_luns") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "num_planes") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pln); - } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "num_pages") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_pg); - } else if (strcmp(attr->name, "page_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->fpg_sz); - } else if (strcmp(attr->name, "hw_sector_size") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->csecs); - } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ - return scnprintf(page, PAGE_SIZE, "%u\n", geo->sos); - } else if (strcmp(attr->name, "prog_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "prog_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "erase_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "erase_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else if (strcmp(attr->name, "multiplane_modes") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mpos); - } else if (strcmp(attr->name, "media_capabilities") == 0) { - return scnprintf(page, PAGE_SIZE, "0x%08x\n", geo->mccap); - } else if (strcmp(attr->name, "max_phys_secs") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", NVM_MAX_VLBA); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -static ssize_t nvm_dev_attr_show_20(struct device *dev, - struct device_attribute *dattr, char *page) -{ - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvm_dev *ndev = ns->ndev; - struct nvm_geo *geo = &ndev->geo; - struct attribute *attr; - - if (!ndev) - return 0; - - attr = &dattr->attr; - - if (strcmp(attr->name, "groups") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_ch); - } else if (strcmp(attr->name, "punits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_lun); - } else if (strcmp(attr->name, "chunks") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->num_chk); - } else if (strcmp(attr->name, "clba") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->clba); - } else if (strcmp(attr->name, "ws_min") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_min); - } else if (strcmp(attr->name, "ws_opt") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->ws_opt); - } else if (strcmp(attr->name, "maxoc") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxoc); - } else if (strcmp(attr->name, "maxocpu") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->maxocpu); - } else if (strcmp(attr->name, "mw_cunits") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->mw_cunits); - } else if (strcmp(attr->name, "write_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprt); - } else if (strcmp(attr->name, "write_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tprm); - } else if (strcmp(attr->name, "reset_typ") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbet); - } else if (strcmp(attr->name, "reset_max") == 0) { - return scnprintf(page, PAGE_SIZE, "%u\n", geo->tbem); - } else { - return scnprintf(page, PAGE_SIZE, - "Unhandled attr(%s) in `%s`\n", - attr->name, __func__); - } -} - -#define NVM_DEV_ATTR_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) -#define NVM_DEV_ATTR_12_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_12, NULL) -#define NVM_DEV_ATTR_20_RO(_name) \ - DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show_20, NULL) - -/* general attributes */ -static NVM_DEV_ATTR_RO(version); -static NVM_DEV_ATTR_RO(capabilities); - -static NVM_DEV_ATTR_RO(read_typ); -static NVM_DEV_ATTR_RO(read_max); - -/* 1.2 values */ -static NVM_DEV_ATTR_12_RO(vendor_opcode); -static NVM_DEV_ATTR_12_RO(device_mode); -static NVM_DEV_ATTR_12_RO(ppa_format); -static NVM_DEV_ATTR_12_RO(media_manager); -static NVM_DEV_ATTR_12_RO(media_type); -static NVM_DEV_ATTR_12_RO(flash_media_type); -static NVM_DEV_ATTR_12_RO(num_channels); -static NVM_DEV_ATTR_12_RO(num_luns); -static NVM_DEV_ATTR_12_RO(num_planes); -static NVM_DEV_ATTR_12_RO(num_blocks); -static NVM_DEV_ATTR_12_RO(num_pages); -static NVM_DEV_ATTR_12_RO(page_size); -static NVM_DEV_ATTR_12_RO(hw_sector_size); -static NVM_DEV_ATTR_12_RO(oob_sector_size); -static NVM_DEV_ATTR_12_RO(prog_typ); -static NVM_DEV_ATTR_12_RO(prog_max); -static NVM_DEV_ATTR_12_RO(erase_typ); -static NVM_DEV_ATTR_12_RO(erase_max); -static NVM_DEV_ATTR_12_RO(multiplane_modes); -static NVM_DEV_ATTR_12_RO(media_capabilities); -static NVM_DEV_ATTR_12_RO(max_phys_secs); - -/* 2.0 values */ -static NVM_DEV_ATTR_20_RO(groups); -static NVM_DEV_ATTR_20_RO(punits); -static NVM_DEV_ATTR_20_RO(chunks); -static NVM_DEV_ATTR_20_RO(clba); -static NVM_DEV_ATTR_20_RO(ws_min); -static NVM_DEV_ATTR_20_RO(ws_opt); -static NVM_DEV_ATTR_20_RO(maxoc); -static NVM_DEV_ATTR_20_RO(maxocpu); -static NVM_DEV_ATTR_20_RO(mw_cunits); -static NVM_DEV_ATTR_20_RO(write_typ); -static NVM_DEV_ATTR_20_RO(write_max); -static NVM_DEV_ATTR_20_RO(reset_typ); -static NVM_DEV_ATTR_20_RO(reset_max); - -static struct attribute *nvm_dev_attrs[] = { - /* version agnostic attrs */ - &dev_attr_version.attr, - &dev_attr_capabilities.attr, - &dev_attr_read_typ.attr, - &dev_attr_read_max.attr, - - /* 1.2 attrs */ - &dev_attr_vendor_opcode.attr, - &dev_attr_device_mode.attr, - &dev_attr_media_manager.attr, - &dev_attr_ppa_format.attr, - &dev_attr_media_type.attr, - &dev_attr_flash_media_type.attr, - &dev_attr_num_channels.attr, - &dev_attr_num_luns.attr, - &dev_attr_num_planes.attr, - &dev_attr_num_blocks.attr, - &dev_attr_num_pages.attr, - &dev_attr_page_size.attr, - &dev_attr_hw_sector_size.attr, - &dev_attr_oob_sector_size.attr, - &dev_attr_prog_typ.attr, - &dev_attr_prog_max.attr, - &dev_attr_erase_typ.attr, - &dev_attr_erase_max.attr, - &dev_attr_multiplane_modes.attr, - &dev_attr_media_capabilities.attr, - &dev_attr_max_phys_secs.attr, - - /* 2.0 attrs */ - &dev_attr_groups.attr, - &dev_attr_punits.attr, - &dev_attr_chunks.attr, - &dev_attr_clba.attr, - &dev_attr_ws_min.attr, - &dev_attr_ws_opt.attr, - &dev_attr_maxoc.attr, - &dev_attr_maxocpu.attr, - &dev_attr_mw_cunits.attr, - - &dev_attr_write_typ.attr, - &dev_attr_write_max.attr, - &dev_attr_reset_typ.attr, - &dev_attr_reset_max.attr, - - NULL, -}; - -static umode_t nvm_dev_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = kobj_to_dev(kobj); - struct gendisk *disk = dev_to_disk(dev); - struct nvme_ns *ns = disk->private_data; - struct nvm_dev *ndev = ns->ndev; - struct device_attribute *dev_attr = - container_of(attr, typeof(*dev_attr), attr); - - if (!ndev) - return 0; - - if (dev_attr->show == nvm_dev_attr_show) - return attr->mode; - - switch (ndev->geo.major_ver_id) { - case 1: - if (dev_attr->show == nvm_dev_attr_show_12) - return attr->mode; - break; - case 2: - if (dev_attr->show == nvm_dev_attr_show_20) - return attr->mode; - break; - } - - return 0; -} - -const struct attribute_group nvme_nvm_attr_group = { - .name = "lightnvm", - .attrs = nvm_dev_attrs, - .is_visible = nvm_dev_attrs_visible, -}; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5cd1fa3b8464..ab803f91ace1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -92,11 +91,6 @@ enum nvme_quirks { */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), - /* - * Supports the LighNVM command set if indicated in vs[1]. - */ - NVME_QUIRK_LIGHTNVM = (1 << 6), - /* * Set MEDIUM priority on SQ creation */ @@ -823,26 +817,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif -#ifdef CONFIG_NVM -int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); -void nvme_nvm_unregister(struct nvme_ns *ns); -extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); -#else -static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, - int node) -{ - return 0; -} - -static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) -{ - return -ENOTTY; -} -#endif /* CONFIG_NVM */ - static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 51852085239e..db7a9bee2014 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3243,12 +3243,6 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, - { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, - { PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */ - .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */ diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h deleted file mode 100644 index 0908abda9c1b..000000000000 --- a/include/linux/lightnvm.h +++ /dev/null @@ -1,697 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef NVM_H -#define NVM_H - -#include -#include -#include - -enum { - NVM_IO_OK = 0, - NVM_IO_REQUEUE = 1, - NVM_IO_DONE = 2, - NVM_IO_ERR = 3, - - NVM_IOTYPE_NONE = 0, - NVM_IOTYPE_GC = 1, -}; - -/* common format */ -#define NVM_GEN_CH_BITS (8) -#define NVM_GEN_LUN_BITS (8) -#define NVM_GEN_BLK_BITS (16) -#define NVM_GEN_RESERVED (32) - -/* 1.2 format */ -#define NVM_12_PG_BITS (16) -#define NVM_12_PL_BITS (4) -#define NVM_12_SEC_BITS (4) -#define NVM_12_RESERVED (8) - -/* 2.0 format */ -#define NVM_20_SEC_BITS (24) -#define NVM_20_RESERVED (8) - -enum { - NVM_OCSSD_SPEC_12 = 12, - NVM_OCSSD_SPEC_20 = 20, -}; - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - /* generic device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 reserved : NVM_GEN_RESERVED; - } a; - - /* 1.2 device format */ - struct { - u64 ch : NVM_GEN_CH_BITS; - u64 lun : NVM_GEN_LUN_BITS; - u64 blk : NVM_GEN_BLK_BITS; - u64 pg : NVM_12_PG_BITS; - u64 pl : NVM_12_PL_BITS; - u64 sec : NVM_12_SEC_BITS; - u64 reserved : NVM_12_RESERVED; - } g; - - /* 2.0 device format */ - struct { - u64 grp : NVM_GEN_CH_BITS; - u64 pu : NVM_GEN_LUN_BITS; - u64 chk : NVM_GEN_BLK_BITS; - u64 sec : NVM_20_SEC_BITS; - u64 reserved : NVM_20_RESERVED; - } m; - - struct { - u64 line : 63; - u64 is_cached : 1; - } c; - - u64 ppa; - }; -}; - -struct nvm_rq; -struct nvm_id; -struct nvm_dev; -struct nvm_tgt_dev; -struct nvm_chk_meta; - -typedef int (nvm_id_fn)(struct nvm_dev *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); -typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, - struct nvm_chk_meta *); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_get_chk_meta_fn *get_chk_meta; - - nvm_submit_io_fn *submit_io; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; -}; - -#ifdef CONFIG_NVM - -#include -#include - -enum { - /* HW Responsibilities */ - NVM_RSP_L2P = 1 << 0, - NVM_RSP_ECC = 1 << 1, - - /* Physical Adressing Mode */ - NVM_ADDRMODE_LINEAR = 0, - NVM_ADDRMODE_CHANNEL = 1, - - /* Plane programming mode for LUN */ - NVM_PLANE_SINGLE = 1, - NVM_PLANE_DOUBLE = 2, - NVM_PLANE_QUAD = 4, - - /* Status codes */ - NVM_RSP_SUCCESS = 0x0, - NVM_RSP_NOT_CHANGEABLE = 0x1, - NVM_RSP_ERR_FAILWRITE = 0x40ff, - NVM_RSP_ERR_EMPTYPAGE = 0x42ff, - NVM_RSP_ERR_FAILECC = 0x4281, - NVM_RSP_ERR_FAILCRC = 0x4004, - NVM_RSP_WARN_HIGHECC = 0x4700, - - /* Device opcodes */ - NVM_OP_PWRITE = 0x91, - NVM_OP_PREAD = 0x92, - NVM_OP_ERASE = 0x90, - - /* PPA Command Flags */ - NVM_IO_SNGL_ACCESS = 0x0, - NVM_IO_DUAL_ACCESS = 0x1, - NVM_IO_QUAD_ACCESS = 0x2, - - /* NAND Access Modes */ - NVM_IO_SUSPEND = 0x80, - NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_ENABLE = 0x200, - - /* Block Types */ - NVM_BLK_T_FREE = 0x0, - NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_GRWN_BAD = 0x2, - NVM_BLK_T_DEV = 0x4, - NVM_BLK_T_HOST = 0x8, - - /* Memory capabilities */ - NVM_ID_CAP_SLC = 0x1, - NVM_ID_CAP_CMD_SUSPEND = 0x2, - NVM_ID_CAP_SCRAMBLE = 0x4, - NVM_ID_CAP_ENCRYPT = 0x8, - - /* Memory types */ - NVM_ID_FMTYPE_SLC = 0, - NVM_ID_FMTYPE_MLC = 1, - - /* Device capabilities */ - NVM_ID_DCAP_BBLKMGMT = 0x1, - NVM_UD_DCAP_ECC = 0x2, -}; - -struct nvm_id_lp_mlc { - u16 num_pairs; - u8 pairs[886]; -}; - -struct nvm_id_lp_tbl { - __u8 id[8]; - struct nvm_id_lp_mlc mlc; -}; - -struct nvm_addrf_12 { - u8 ch_len; - u8 lun_len; - u8 blk_len; - u8 pg_len; - u8 pln_len; - u8 sec_len; - - u8 ch_offset; - u8 lun_offset; - u8 blk_offset; - u8 pg_offset; - u8 pln_offset; - u8 sec_offset; - - u64 ch_mask; - u64 lun_mask; - u64 blk_mask; - u64 pg_mask; - u64 pln_mask; - u64 sec_mask; -}; - -struct nvm_addrf { - u8 ch_len; - u8 lun_len; - u8 chk_len; - u8 sec_len; - u8 rsv_len[2]; - - u8 ch_offset; - u8 lun_offset; - u8 chk_offset; - u8 sec_offset; - u8 rsv_off[2]; - - u64 ch_mask; - u64 lun_mask; - u64 chk_mask; - u64 sec_mask; - u64 rsv_mask[2]; -}; - -enum { - /* Chunk states */ - NVM_CHK_ST_FREE = 1 << 0, - NVM_CHK_ST_CLOSED = 1 << 1, - NVM_CHK_ST_OPEN = 1 << 2, - NVM_CHK_ST_OFFLINE = 1 << 3, - - /* Chunk types */ - NVM_CHK_TP_W_SEQ = 1 << 0, - NVM_CHK_TP_W_RAN = 1 << 1, - NVM_CHK_TP_SZ_SPEC = 1 << 4, -}; - -/* - * Note: The structure size is linked to nvme_nvm_chk_meta such that the same - * buffer can be used when converting from little endian to cpu addressing. - */ -struct nvm_chk_meta { - u8 state; - u8 type; - u8 wi; - u8 rsvd[5]; - u64 slba; - u64 cnlb; - u64 wp; -}; - -struct nvm_target { - struct list_head list; - struct nvm_tgt_dev *dev; - struct nvm_tgt_type *type; - struct gendisk *disk; -}; - -#define ADDR_EMPTY (~0ULL) - -#define NVM_TARGET_DEFAULT_OP (101) -#define NVM_TARGET_MIN_OP (3) -#define NVM_TARGET_MAX_OP (80) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCH 0 - -#define NVM_MAX_VLBA (64) /* max logical blocks in a vector command */ - -struct nvm_rq; -typedef void (nvm_end_io_fn)(struct nvm_rq *); - -struct nvm_rq { - struct nvm_tgt_dev *dev; - - struct bio *bio; - - union { - struct ppa_addr ppa_addr; - dma_addr_t dma_ppa_list; - }; - - struct ppa_addr *ppa_list; - - void *meta_list; - dma_addr_t dma_meta_list; - - nvm_end_io_fn *end_io; - - uint8_t opcode; - uint16_t nr_ppas; - uint16_t flags; - - u64 ppa_status; /* ppa media status */ - int error; - - int is_seq; /* Sequential hint flag. 1.2 only */ - - void *private; -}; - -static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) -{ - return pdu - sizeof(struct nvm_rq); -} - -static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) -{ - return rqdata + 1; -} - -static inline struct ppa_addr *nvm_rq_to_ppa_list(struct nvm_rq *rqd) -{ - return (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; -} - -enum { - NVM_BLK_ST_FREE = 0x1, /* Free block */ - NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ - NVM_BLK_ST_BAD = 0x8, /* Bad block */ -}; - -/* Instance geometry */ -struct nvm_geo { - /* device reported version */ - u8 major_ver_id; - u8 minor_ver_id; - - /* kernel short version */ - u8 version; - - /* instance specific geometry */ - int num_ch; - int num_lun; /* per channel */ - - /* calculated values */ - int all_luns; /* across channels */ - int all_chunks; /* across channels */ - - int op; /* over-provision in instance */ - - sector_t total_secs; /* across channels */ - - /* chunk geometry */ - u32 num_chk; /* chunks per lun */ - u32 clba; /* sectors per chunk */ - u16 csecs; /* sector size */ - u16 sos; /* out-of-band area size */ - bool ext; /* metadata in extended data buffer */ - u32 mdts; /* Max data transfer size*/ - - /* device write constrains */ - u32 ws_min; /* minimum write size */ - u32 ws_opt; /* optimal write size */ - u32 mw_cunits; /* distance required for successful read */ - u32 maxoc; /* maximum open chunks */ - u32 maxocpu; /* maximum open chunks per parallel unit */ - - /* device capabilities */ - u32 mccap; - - /* device timings */ - u32 trdt; /* Avg. Tread (ns) */ - u32 trdm; /* Max Tread (ns) */ - u32 tprt; /* Avg. Tprog (ns) */ - u32 tprm; /* Max Tprog (ns) */ - u32 tbet; /* Avg. Terase (ns) */ - u32 tbem; /* Max Terase (ns) */ - - /* generic address format */ - struct nvm_addrf addrf; - - /* 1.2 compatibility */ - u8 vmnt; - u32 cap; - u32 dom; - - u8 mtype; - u8 fmtype; - - u16 cpar; - u32 mpos; - - u8 num_pln; - u8 pln_mode; - u16 num_pg; - u16 fpg_sz; -}; - -/* sub-device structure */ -struct nvm_tgt_dev { - /* Device information */ - struct nvm_geo geo; - - /* Base ppas for target LUNs */ - struct ppa_addr *luns; - - struct request_queue *q; - - struct nvm_dev *parent; - void *map; -}; - -struct nvm_dev { - struct nvm_dev_ops *ops; - - struct list_head devices; - - /* Device information */ - struct nvm_geo geo; - - unsigned long *lun_map; - void *dma_pool; - - /* Backend device */ - struct request_queue *q; - char name[DISK_NAME_LEN]; - void *private_data; - - struct kref ref; - void *rmap; - - struct mutex mlock; - spinlock_t lock; - - /* target management */ - struct list_head area_list; - struct list_head targets; -}; - -static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.ppa = ((u64)r.g.ch) << ppaf->ch_offset; - l.ppa |= ((u64)r.g.lun) << ppaf->lun_offset; - l.ppa |= ((u64)r.g.blk) << ppaf->blk_offset; - l.ppa |= ((u64)r.g.pg) << ppaf->pg_offset; - l.ppa |= ((u64)r.g.pl) << ppaf->pln_offset; - l.ppa |= ((u64)r.g.sec) << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.ppa = ((u64)r.m.grp) << lbaf->ch_offset; - l.ppa |= ((u64)r.m.pu) << lbaf->lun_offset; - l.ppa |= ((u64)r.m.chk) << lbaf->chk_offset; - l.ppa |= ((u64)r.m.sec) << lbaf->sec_offset; - } - - return l; -} - -static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, - struct ppa_addr r) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr l; - - l.ppa = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)&geo->addrf; - - l.g.ch = (r.ppa & ppaf->ch_mask) >> ppaf->ch_offset; - l.g.lun = (r.ppa & ppaf->lun_mask) >> ppaf->lun_offset; - l.g.blk = (r.ppa & ppaf->blk_mask) >> ppaf->blk_offset; - l.g.pg = (r.ppa & ppaf->pg_mask) >> ppaf->pg_offset; - l.g.pl = (r.ppa & ppaf->pln_mask) >> ppaf->pln_offset; - l.g.sec = (r.ppa & ppaf->sec_mask) >> ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = &geo->addrf; - - l.m.grp = (r.ppa & lbaf->ch_mask) >> lbaf->ch_offset; - l.m.pu = (r.ppa & lbaf->lun_mask) >> lbaf->lun_offset; - l.m.chk = (r.ppa & lbaf->chk_mask) >> lbaf->chk_offset; - l.m.sec = (r.ppa & lbaf->sec_mask) >> lbaf->sec_offset; - } - - return l; -} - -static inline u64 dev_to_chunk_addr(struct nvm_dev *dev, void *addrf, - struct ppa_addr p) -{ - struct nvm_geo *geo = &dev->geo; - u64 caddr; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = (struct nvm_addrf_12 *)addrf; - - caddr = (u64)p.g.pg << ppaf->pg_offset; - caddr |= (u64)p.g.pl << ppaf->pln_offset; - caddr |= (u64)p.g.sec << ppaf->sec_offset; - } else { - caddr = p.m.sec; - } - - return caddr; -} - -static inline struct ppa_addr nvm_ppa32_to_ppa64(struct nvm_dev *dev, - void *addrf, u32 ppa32) -{ - struct ppa_addr ppa64; - - ppa64.ppa = 0; - - if (ppa32 == -1) { - ppa64.ppa = ADDR_EMPTY; - } else if (ppa32 & (1U << 31)) { - ppa64.c.line = ppa32 & ((~0U) >> 1); - ppa64.c.is_cached = 1; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - - ppa64.g.ch = (ppa32 & ppaf->ch_mask) >> - ppaf->ch_offset; - ppa64.g.lun = (ppa32 & ppaf->lun_mask) >> - ppaf->lun_offset; - ppa64.g.blk = (ppa32 & ppaf->blk_mask) >> - ppaf->blk_offset; - ppa64.g.pg = (ppa32 & ppaf->pg_mask) >> - ppaf->pg_offset; - ppa64.g.pl = (ppa32 & ppaf->pln_mask) >> - ppaf->pln_offset; - ppa64.g.sec = (ppa32 & ppaf->sec_mask) >> - ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - ppa64.m.grp = (ppa32 & lbaf->ch_mask) >> - lbaf->ch_offset; - ppa64.m.pu = (ppa32 & lbaf->lun_mask) >> - lbaf->lun_offset; - ppa64.m.chk = (ppa32 & lbaf->chk_mask) >> - lbaf->chk_offset; - ppa64.m.sec = (ppa32 & lbaf->sec_mask) >> - lbaf->sec_offset; - } - } - - return ppa64; -} - -static inline u32 nvm_ppa64_to_ppa32(struct nvm_dev *dev, - void *addrf, struct ppa_addr ppa64) -{ - u32 ppa32 = 0; - - if (ppa64.ppa == ADDR_EMPTY) { - ppa32 = ~0U; - } else if (ppa64.c.is_cached) { - ppa32 |= ppa64.c.line; - ppa32 |= 1U << 31; - } else { - struct nvm_geo *geo = &dev->geo; - - if (geo->version == NVM_OCSSD_SPEC_12) { - struct nvm_addrf_12 *ppaf = addrf; - - ppa32 |= ppa64.g.ch << ppaf->ch_offset; - ppa32 |= ppa64.g.lun << ppaf->lun_offset; - ppa32 |= ppa64.g.blk << ppaf->blk_offset; - ppa32 |= ppa64.g.pg << ppaf->pg_offset; - ppa32 |= ppa64.g.pl << ppaf->pln_offset; - ppa32 |= ppa64.g.sec << ppaf->sec_offset; - } else { - struct nvm_addrf *lbaf = addrf; - - ppa32 |= ppa64.m.grp << lbaf->ch_offset; - ppa32 |= ppa64.m.pu << lbaf->lun_offset; - ppa32 |= ppa64.m.chk << lbaf->chk_offset; - ppa32 |= ppa64.m.sec << lbaf->sec_offset; - } - } - - return ppa32; -} - -static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev, - struct ppa_addr *ppa) -{ - struct nvm_geo *geo = &dev->geo; - int last = 0; - - if (geo->version == NVM_OCSSD_SPEC_12) { - int sec = ppa->g.sec; - - sec++; - if (sec == geo->ws_min) { - int pg = ppa->g.pg; - - sec = 0; - pg++; - if (pg == geo->num_pg) { - int pl = ppa->g.pl; - - pg = 0; - pl++; - if (pl == geo->num_pln) - last = 1; - - ppa->g.pl = pl; - } - ppa->g.pg = pg; - } - ppa->g.sec = sec; - } else { - ppa->m.sec++; - if (ppa->m.sec == geo->clba) - last = 1; - } - - return last; -} - -typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, - int flags); -typedef void (nvm_tgt_exit_fn)(void *, bool); -typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); -typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); - -enum { - NVM_TGT_F_DEV_L2P = 0, - NVM_TGT_F_HOST_L2P = 1 << 0, -}; - -struct nvm_tgt_type { - const char *name; - unsigned int version[3]; - int flags; - - /* target entry points */ - const struct block_device_operations *bops; - nvm_tgt_capacity_fn *capacity; - - /* module-specific init/teardown */ - nvm_tgt_init_fn *init; - nvm_tgt_exit_fn *exit; - - /* sysfs */ - nvm_tgt_sysfs_init_fn *sysfs_init; - nvm_tgt_sysfs_exit_fn *sysfs_exit; - - /* For internal use */ - struct list_head list; - struct module *owner; -}; - -extern int nvm_register_tgt_type(struct nvm_tgt_type *); -extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); - -extern void *nvm_dev_dma_alloc(struct nvm_dev *, gfp_t, dma_addr_t *); -extern void nvm_dev_dma_free(struct nvm_dev *, void *, dma_addr_t); - -extern struct nvm_dev *nvm_alloc_dev(int); -extern int nvm_register(struct nvm_dev *); -extern void nvm_unregister(struct nvm_dev *); - -extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, - int, struct nvm_chk_meta *); -extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *, - int, int); -extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *); -extern void nvm_end_io(struct nvm_rq *); - -#else /* CONFIG_NVM */ -struct nvm_dev_ops; - -static inline struct nvm_dev *nvm_alloc_dev(int node) -{ - return ERR_PTR(-EINVAL); -} -static inline int nvm_register(struct nvm_dev *dev) -{ - return -EINVAL; -} -static inline void nvm_unregister(struct nvm_dev *dev) {} -#endif /* CONFIG_NVM */ -#endif /* LIGHTNVM.H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h deleted file mode 100644 index 2745afd9b8fa..000000000000 --- a/include/uapi/linux/lightnvm.h +++ /dev/null @@ -1,224 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2015 CNEX Labs. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, - * USA. - */ - -#ifndef _UAPI_LINUX_LIGHTNVM_H -#define _UAPI_LINUX_LIGHTNVM_H - -#ifdef __KERNEL__ -#include -#else /* __KERNEL__ */ -#include -#include -#define DISK_NAME_LEN 32 -#endif /* __KERNEL__ */ - -#include -#include - -#define NVM_TTYPE_NAME_MAX 48 -#define NVM_TTYPE_MAX 63 -#define NVM_MMTYPE_LEN 8 - -#define NVM_CTRL_FILE "/dev/lightnvm/control" - -struct nvm_ioctl_info_tgt { - __u32 version[3]; - __u32 reserved; - char tgtname[NVM_TTYPE_NAME_MAX]; -}; - -struct nvm_ioctl_info { - __u32 version[3]; /* in/out - major, minor, patch */ - __u16 tgtsize; /* number of targets */ - __u16 reserved16; /* pad to 4K page */ - __u32 reserved[12]; - struct nvm_ioctl_info_tgt tgts[NVM_TTYPE_MAX]; -}; - -enum { - NVM_DEVICE_ACTIVE = 1 << 0, -}; - -struct nvm_ioctl_device_info { - char devname[DISK_NAME_LEN]; - char bmname[NVM_TTYPE_NAME_MAX]; - __u32 bmversion[3]; - __u32 flags; - __u32 reserved[8]; -}; - -struct nvm_ioctl_get_devices { - __u32 nr_devices; - __u32 reserved[31]; - struct nvm_ioctl_device_info info[31]; -}; - -struct nvm_ioctl_create_simple { - __u32 lun_begin; - __u32 lun_end; -}; - -struct nvm_ioctl_create_extended { - __u16 lun_begin; - __u16 lun_end; - __u16 op; - __u16 rsv; -}; - -enum { - NVM_CONFIG_TYPE_SIMPLE = 0, - NVM_CONFIG_TYPE_EXTENDED = 1, -}; - -struct nvm_ioctl_create_conf { - __u32 type; - union { - struct nvm_ioctl_create_simple s; - struct nvm_ioctl_create_extended e; - }; -}; - -enum { - NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */ -}; - -struct nvm_ioctl_create { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */ - char tgtname[DISK_NAME_LEN]; /* dev to expose target as */ - - __u32 flags; - - struct nvm_ioctl_create_conf conf; -}; - -struct nvm_ioctl_remove { - char tgtname[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_ioctl_dev_init { - char dev[DISK_NAME_LEN]; /* open-channel SSD device */ - char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ - - __u32 flags; -}; - -enum { - NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as - * host blks or grown blks */ - NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ - NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ - NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ -}; - -struct nvm_ioctl_dev_factory { - char dev[DISK_NAME_LEN]; - - __u32 flags; -}; - -struct nvm_user_vio { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nppas; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 ppa_list; - __u32 metadata_len; - __u32 data_len; - __u64 status; - __u32 result; - __u32 rsvd3[3]; -}; - -struct nvm_passthru_vio { - __u8 opcode; - __u8 flags; - __u8 rsvd[2]; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u64 ppa_list; - __u16 nppas; - __u16 control; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u64 status; - __u32 result; - __u32 timeout_ms; -}; - -/* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ -enum { - /* top level cmds */ - NVM_INFO_CMD = 0x20, - NVM_GET_DEVICES_CMD, - - /* device level cmds */ - NVM_DEV_CREATE_CMD, - NVM_DEV_REMOVE_CMD, - - /* Init a device to support LightNVM media managers */ - NVM_DEV_INIT_CMD, - - /* Factory reset device */ - NVM_DEV_FACTORY_CMD, - - /* Vector user I/O */ - NVM_DEV_VIO_ADMIN_CMD = 0x41, - NVM_DEV_VIO_CMD = 0x42, - NVM_DEV_VIO_USER_CMD = 0x43, -}; - -#define NVM_IOCTL 'L' /* 0x4c */ - -#define NVM_INFO _IOWR(NVM_IOCTL, NVM_INFO_CMD, \ - struct nvm_ioctl_info) -#define NVM_GET_DEVICES _IOR(NVM_IOCTL, NVM_GET_DEVICES_CMD, \ - struct nvm_ioctl_get_devices) -#define NVM_DEV_CREATE _IOW(NVM_IOCTL, NVM_DEV_CREATE_CMD, \ - struct nvm_ioctl_create) -#define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ - struct nvm_ioctl_remove) -#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ - struct nvm_ioctl_dev_init) -#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ - struct nvm_ioctl_dev_factory) - -#define NVME_NVM_IOCTL_IO_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_USER_CMD, \ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_ADMIN_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_ADMIN_CMD,\ - struct nvm_passthru_vio) -#define NVME_NVM_IOCTL_SUBMIT_VIO _IOWR(NVM_IOCTL, NVM_DEV_VIO_CMD,\ - struct nvm_user_vio) - -#define NVM_VERSION_MAJOR 1 -#define NVM_VERSION_MINOR 0 -#define NVM_VERSION_PATCHLEVEL 0 - -#endif From 4f1e9630afe6332de7286820fedd019f19eac057 Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 2 Aug 2021 11:51:56 +0800 Subject: [PATCH 123/315] blk-throtl: optimize IOPS throttle for large IO scenarios After patch 54efd50 (block: make generic_make_request handle arbitrarily sized bios), the IO through io-throttle may be larger, and these IOs may be further split into more small IOs. However, IOPS throttle does not seem to be aware of this change, which makes the calculation of IOPS of large IOs incomplete, resulting in disk-side IOPS that does not meet expectations. Maybe we should fix this problem. We can reproduce it by set max_sectors_kb of disk to 128, set blkio.write_iops_throttle to 100, run a dd instance inside blkio and use iostat to watch IOPS: dd if=/dev/zero of=/dev/sdb bs=1M count=1000 oflag=direct As a result, without this change the average IOPS is 1995, with this change the IOPS is 98. Signed-off-by: Chunguang Xu Acked-by: Tejun Heo Link: https://lore.kernel.org/r/65869aaad05475797d63b4c3fed4f529febe3c26.1627876014.git.brookxu@tencent.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ block/blk-throttle.c | 32 ++++++++++++++++++++++++++++++++ block/blk.h | 2 ++ 3 files changed, 36 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index f8707ff7e2fc..eeba8422ae82 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -348,6 +348,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; + + blk_throtl_charge_bio_split(*bio); } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b1b22d863bdf..55c49015e533 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -178,6 +178,9 @@ struct throtl_grp { unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ unsigned long bio_cnt_reset_time; + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -777,6 +780,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + atomic_set(&tg->io_split_cnt[rw], 0); + /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used @@ -799,6 +804,9 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -1031,6 +1039,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -2052,12 +2063,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2176,6 +2189,25 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + bool blk_throtl_bio(struct bio *bio) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; diff --git a/block/blk.h b/block/blk.h index db6f82bbb683..148bdcd3aa08 100644 --- a/block/blk.h +++ b/block/blk.h @@ -293,11 +293,13 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +extern void blk_throtl_charge_bio_split(struct bio *bio); bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW From 2a14c9ae15a38148484a128b84bff7e9ffd90d68 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:33 -0700 Subject: [PATCH 124/315] params: lift param_set_uint_minmax to common code It is a useful helper hence move it to common code so others can enjoy it. Suggested-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/moduleparam.h | 2 ++ kernel/params.c | 18 ++++++++++++++++++ net/sunrpc/xprtsock.c | 18 ------------------ 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index eed280fae433..962cd41a2cb5 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -431,6 +431,8 @@ extern int param_get_int(char *buffer, const struct kernel_param *kp); extern const struct kernel_param_ops param_ops_uint; extern int param_set_uint(const char *val, const struct kernel_param *kp); extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; diff --git a/kernel/params.c b/kernel/params.c index 2daa2780a92c..8299bd764e42 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -243,6 +243,24 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); +int param_set_uint_minmax(const char *val, const struct kernel_param *kp, + unsigned int min, unsigned int max) +{ + unsigned int num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + if (num < min || num > max) + return -EINVAL; + *((unsigned int *)kp->arg) = num; + return 0; +} +EXPORT_SYMBOL_GPL(param_set_uint_minmax); + int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66..b7dbdcbdeb6c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3149,24 +3149,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, From 27453b45e62da8656739f7e1365ea9318e7b040e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:34 -0700 Subject: [PATCH 125/315] nvme-pci: limit maximum queue depth to 4095 We are going to use the upper 4-bits of the command_id for a generation counter, so enforce the new queue depth upper limit. As we enforce both min and max queue depth, use param_set_uint_minmax istead of open coding it. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index db7a9bee2014..4880badb26d4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -60,6 +60,8 @@ MODULE_PARM_DESC(sgl_threshold, "Use SGLs when average request segment size is larger or equal to " "this size. Use 0 to disable SGLs."); +#define NVME_PCI_MIN_QUEUE_SIZE 2 +#define NVME_PCI_MAX_QUEUE_SIZE 4095 static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -68,7 +70,7 @@ static const struct kernel_param_ops io_queue_depth_ops = { static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); -MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096"); static int io_queue_count_set(const char *val, const struct kernel_param *kp) { @@ -157,14 +159,8 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int ret; - u32 n; - - ret = kstrtou32(val, 10, &n); - if (ret != 0 || n < 2) - return -EINVAL; - - return param_set_uint(val, kp); + return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE, + NVME_PCI_MAX_QUEUE_SIZE); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) From 3b01a9d0caa8276d9ce314e09610f7fb70f49a00 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:35 -0700 Subject: [PATCH 126/315] nvme-tcp: don't check blk_mq_tag_to_rq when receiving pdu data We already validate it when receiving the c2hdata pdu header and this is not changing so this is a redundant check. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cb15ee5b249..d649b446da66 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -702,17 +702,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; From e7006de6c23803799be000a5dcce4d916a36541a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 16 Jun 2021 14:19:36 -0700 Subject: [PATCH 127/315] nvme: code command_id with a genctr for use-after-free validation We cannot detect a (perhaps buggy) controller that is sending us a completion for a request that was already completed (for example sending a completion twice), this phenomenon was seen in the wild a few times. So to protect against this, we use the upper 4 msbits of the nvme sqe command_id to use as a 4-bit generation counter and verify it matches the existing request generation that is incrementing on every execution. The 16-bit command_id structure now is constructed by: | xxxx | xxxxxxxxxxxx | gen request tag This means that we are giving up some possible queue depth as 12 bits allow for a maximum queue depth of 4095 instead of 65536, however we never create such long queues anyways so no real harm done. Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Acked-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 47 +++++++++++++++++++++++++++++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/host/tcp.c | 26 ++++++++++----------- drivers/nvme/target/loop.c | 4 ++-- 6 files changed, 66 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce33014e3eb0..b9a46c54f714 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1026,7 +1026,8 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) return BLK_STS_IOERR; } - cmd->common.command_id = req->tag; + nvme_req(req)->genctr++; + cmd->common.command_id = nvme_cid(req); trace_nvme_setup_cmd(req, cmd); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab803f91ace1..57d2ac00a6bd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -152,6 +152,7 @@ enum nvme_quirks { struct nvme_request { struct nvme_command *cmd; union nvme_result result; + u8 genctr; u8 retries; u8 flags; u16 status; @@ -491,6 +492,49 @@ struct nvme_ctrl_ops { int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; +/* + * nvme command_id is constructed as such: + * | xxxx | xxxxxxxxxxxx | + * gen request tag + */ +#define nvme_genctr_mask(gen) (gen & 0xf) +#define nvme_cid_install_genctr(gen) (nvme_genctr_mask(gen) << 12) +#define nvme_genctr_from_cid(cid) ((cid & 0xf000) >> 12) +#define nvme_tag_from_cid(cid) (cid & 0xfff) + +static inline u16 nvme_cid(struct request *rq) +{ + return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag; +} + +static inline struct request *nvme_find_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + u8 genctr = nvme_genctr_from_cid(command_id); + u16 tag = nvme_tag_from_cid(command_id); + struct request *rq; + + rq = blk_mq_tag_to_rq(tags, tag); + if (unlikely(!rq)) { + pr_err("could not locate request for tag %#x\n", + tag); + return NULL; + } + if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) { + dev_err(nvme_req(rq)->ctrl->device, + "request %#x genctr mismatch (got %#x expected %#x)\n", + tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr)); + return NULL; + } + return rq; +} + +static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags, + u16 command_id) +{ + return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id)); +} + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, const char *dev_name); @@ -588,7 +632,8 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) static inline bool nvme_is_aen_req(u16 qid, __u16 command_id) { - return !qid && command_id >= NVME_AQ_BLK_MQ_DEPTH; + return !qid && + nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH; } void nvme_complete_rq(struct request *req); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4880badb26d4..0471c2c7d64b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1010,7 +1010,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); + req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7f6b3a991501..69ae67652f38 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1730,10 +1730,10 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, struct request *rq; struct nvme_rdma_request *req; - rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on QP %#x not found\n", + "got bad command_id %#x on QP %#x\n", cqe->command_id, queue->qp->qp_num); nvme_rdma_error_recovery(queue->ctrl); return; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d649b446da66..0a97ba02f61e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -487,11 +487,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag 0x%x not found\n", - nvme_tcp_queue_id(queue), cqe->command_id); + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); nvme_tcp_error_recovery(&queue->ctrl->ctrl); return -EINVAL; } @@ -508,11 +508,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } @@ -606,7 +606,7 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); data->ttag = pdu->ttag; - data->command_id = rq->tag; + data->command_id = nvme_cid(rq); data->data_offset = cpu_to_le32(req->data_sent); data->data_length = cpu_to_le32(req->pdu_len); return 0; @@ -619,11 +619,11 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct request *rq; int ret; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } req = blk_mq_rq_to_pdu(rq); @@ -703,7 +703,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; struct request *rq = - blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { @@ -796,8 +796,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), - pdu->command_id); + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); queue->nr_cqe++; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3a17a7e26bbf..0285ccc7541f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -107,10 +107,10 @@ static void nvme_loop_queue_response(struct nvmet_req *req) } else { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_loop_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "tag 0x%x on queue %d not found\n", + "got bad command_id %#x on queue %d\n", cqe->command_id, nvme_loop_queue_idx(queue)); return; } From 0521905e859fd1a07949cb18efb20cdd4aab3b20 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 14 Jul 2021 14:02:37 -0700 Subject: [PATCH 128/315] nvme-pci: use attribute group for cmb sysfs Appending sysfs files to the controller kobject is a bit clunky and becomes a maintenance problem as more attributes are added. The attribute group infrastructure handles this better, so use that. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 72 ++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0471c2c7d64b..6658f58ef824 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -155,6 +155,8 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; + + bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -1804,17 +1806,6 @@ static int nvme_create_io_queues(struct nvme_dev *dev) return ret >= 0 ? 0 : ret; } -static ssize_t nvme_cmb_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - - return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", - ndev->cmbloc, ndev->cmbsz); -} -static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); - static u64 nvme_cmb_size_unit(struct nvme_dev *dev) { u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; @@ -1883,20 +1874,6 @@ static void nvme_map_cmb(struct nvme_dev *dev) if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) == (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) pci_p2pmem_publish(pdev, true); - - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); -} - -static inline void nvme_release_cmb(struct nvme_dev *dev) -{ - if (dev->cmb_size) { - sysfs_remove_file_from_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL); - dev->cmb_size = 0; - } } static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) @@ -2076,6 +2053,38 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) return ret; } +static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n", + ndev->cmbloc, ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmb); + +static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct nvme_ctrl *ctrl = + dev_get_drvdata(container_of(kobj, struct device, kobj)); + struct nvme_dev *dev = to_nvme_dev(ctrl); + + if (a == &dev_attr_cmb.attr && !dev->cmbsz) + return 0; + return a->mode; +} + +static struct attribute *nvme_pci_attrs[] = { + &dev_attr_cmb.attr, + NULL, +}; + +static const struct attribute_group nvme_pci_attr_group = { + .attrs = nvme_pci_attrs, + .is_visible = nvme_pci_attrs_are_visible, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. @@ -2747,6 +2756,10 @@ static void nvme_reset_work(struct work_struct *work) goto out; } + if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group)) + dev->attrs_added = true; + nvme_start_ctrl(&dev->ctrl); return; @@ -2995,6 +3008,13 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } +static void nvme_remove_attrs(struct nvme_dev *dev) +{ + if (dev->attrs_added) + sysfs_remove_group(&dev->ctrl.device->kobj, + &nvme_pci_attr_group); +} + /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3016,7 +3036,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_release_cmb(dev); + nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); From 1751e97aa940656b5de0e620f02cf193a275e014 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 16 Jul 2021 09:22:49 +0200 Subject: [PATCH 129/315] nvme-pci: cmb sysfs: one file, one value An attribute should only be exporting one value as recommended in Documentation/filesystems/sysfs.rst. Implement CMB attributes this way. The old attribute will remain for backward compatibility. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6658f58ef824..909dadcdab09 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2063,6 +2063,24 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmb); +static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbloc); +} +static DEVICE_ATTR_RO(cmbloc); + +static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%u\n", ndev->cmbsz); +} +static DEVICE_ATTR_RO(cmbsz); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2070,13 +2088,19 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, dev_get_drvdata(container_of(kobj, struct device, kobj)); struct nvme_dev *dev = to_nvme_dev(ctrl); - if (a == &dev_attr_cmb.attr && !dev->cmbsz) - return 0; + if (a == &dev_attr_cmb.attr || + a == &dev_attr_cmbloc.attr || + a == &dev_attr_cmbsz.attr) { + if (!dev->cmbsz) + return 0; + } return a->mode; } static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, + &dev_attr_cmbloc.attr, + &dev_attr_cmbsz.attr, NULL, }; From e23439e977ed2b247912c2b5c6945ef1bc380100 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Fri, 9 Jul 2021 10:32:47 +0800 Subject: [PATCH 130/315] nvme-fabrics: remove superfluous nvmf_host_put in nvmf_parse_options Opts->host is NULL there. It is checked just before. So remove nvmf_host_put. It is introduced by commit 59a2f3f00fd7 ("nvme: fix potential memory leak in option parsing"). Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a5469fd9d4c3..668c6bb7a567 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -719,7 +719,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } - nvmf_host_put(opts->host); opts->host = nvmf_host_add(p); kfree(p); if (!opts->host) { From a7b5e8d864b356fdacfea08d9042261c37bc918e Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:28 +0800 Subject: [PATCH 131/315] nvme: add set feature tracing support A nvme connect command produces following trace. Before: /sys/kernel/debug/tracing# cat trace | grep feature kworker/5:1H-98 [005] .... 3221.294844: nvme_setup_cmd: nvme0: qid=0, cmdid=25, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/4:1H-124 [004] .... 3222.009186: nvme_setup_cmd: nvme0: qid=0, cmdid=17, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) After: /sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-253 [000] .... 196.060509: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-253 [000] .... 196.763947: nvme_setup_cmd: nvme0: qid=0, cmdid=29, nsid=0, flags=0x0, meta=0x0, cmd=(nvme_admin_set_features fid=0xb, sv=0x0, cdw11=0x900) Using ',' to separate different field like others in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 6543015b6121..2a89c5aa0790 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -72,6 +72,20 @@ static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 *cdw10) { @@ -80,7 +94,7 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -201,6 +215,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: return nvme_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvme_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: From 8d84f9de69ca23f2637dc19d96f39228c8426e97 Mon Sep 17 00:00:00 2001 From: Hou Pu Date: Mon, 5 Jul 2021 11:15:29 +0800 Subject: [PATCH 132/315] nvmet: add set feature tracing support A nvme connect command produces following trace from the target side. Before: kworker/0:1H-56 [000] .... 9012.155139: nvmet_req_init: nvmet1: qid=0, cmdid=16, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=07 00 00 00 07 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) kworker/0:1H-56 [000] .... 9012.872272: nvmet_req_init: nvmet1: qid=0, cmdid=13, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, cdw10=0b 00 00 00 00 09 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00) cmdline:/sys/kernel/debug/tracing# cat trace | grep feature kworker/0:1H-56 [000] .... 203.493914: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0x7, sv=0x0, cdw11=0x70007) kworker/0:1H-56 [000] .... 204.197079: nvmet_req_init: nvmet1: qid=0, cmdid=29, nsid=0, flags=0x40, meta=0x0, cmd=(nvme_admin_set_features, fid=0xb, sv=0x0, cdw11=0x900) Using ',' to separate different field like others in nvmet_trace_admin_get_features. Signed-off-by: Hou Pu Signed-off-by: Christoph Hellwig --- drivers/nvme/target/trace.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 1373a3c67962..bff454d46255 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -27,7 +27,7 @@ static const char *nvmet_trace_admin_get_features(struct trace_seq *p, u8 sel = cdw10[1] & 0x7; u32 cdw11 = get_unaligned_le32(cdw10 + 4); - trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11); + trace_seq_printf(p, "fid=0x%x, sel=0x%x, cdw11=0x%x", fid, sel, cdw11); trace_seq_putc(p, 0); return ret; @@ -49,6 +49,20 @@ static const char *nvmet_trace_get_lba_status(struct trace_seq *p, return ret; } +static const char *nvmet_trace_admin_set_features(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 fid = cdw10[0]; + u8 sv = cdw10[3] & 0x8; + u32 cdw11 = get_unaligned_le32(cdw10 + 4); + + trace_seq_printf(p, "fid=0x%x, sv=0x%x, cdw11=0x%x", fid, sv, cdw11); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvmet_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -94,6 +108,8 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, switch (opcode) { case nvme_admin_identify: return nvmet_trace_admin_identify(p, cdw10); + case nvme_admin_set_features: + return nvmet_trace_admin_set_features(p, cdw10); case nvme_admin_get_features: return nvmet_trace_admin_get_features(p, cdw10); case nvme_admin_get_lba_status: From ad0e9a80ba0f20db0f86e23d1ad2979513a9a8ee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Jul 2021 15:56:50 +0100 Subject: [PATCH 133/315] nvmet: remove redundant assignments of variable status There are two occurrances where variable status is being assigned a value that is never read and it is being re-assigned a new value almost immediately afterwards on an error exit path. The assignments are redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/zns.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 17f8b7a45f21..46bc30fe85d2 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -115,14 +115,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) } status = nvmet_req_find_ns(req); - if (status) { - status = NVME_SC_INTERNAL; + if (status) goto done; - } if (!bdev_is_zoned(req->ns->bdev)) { req->error_loc = offsetof(struct nvme_identify, nsid); - status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto done; } From e5ad96f388b765fe6b52f64f37e910c0ba4f3de7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:44 -0700 Subject: [PATCH 134/315] nvme-pci: disable hmb on idle suspend An idle suspend may or may not disable host memory access from devices placed in low power mode. Either way, it should always be safe to disable the host memory buffer prior to entering the low power mode, and this should also always be faster than a full device shutdown. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 909dadcdab09..5b23d5818f75 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3087,8 +3087,13 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - return nvme_try_sched_reset(&ndev->ctrl); + goto reset; + if (ctrl->hmpre && nvme_setup_host_mem(ndev)) + goto reset; + return 0; +reset: + return nvme_try_sched_reset(ctrl); } static int nvme_suspend(struct device *dev) @@ -3112,15 +3117,9 @@ static int nvme_suspend(struct device *dev) * the PCI bus layer to put it into D3 in order to take the PCIe link * down, so as to allow the platform to achieve its minimum low-power * state (which may not be possible if the link is up). - * - * If a host memory buffer is enabled, shut down the device as the NVMe - * specification allows the device to access the host memory buffer in - * host DRAM from all power states, but hosts will fail access to DRAM - * during S3. */ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - ndev->nr_host_mem_descs || (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) return nvme_disable_prepare_reset(ndev, true); @@ -3131,6 +3130,17 @@ static int nvme_suspend(struct device *dev) if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; + /* + * Host memory access may not be successful in a system suspend state, + * but the specification allows the controller to access memory in a + * non-operational power state. + */ + if (ndev->hmb) { + ret = nvme_set_host_mem(ndev, 0); + if (ret < 0) + goto unfreeze; + } + ret = nvme_get_power_state(ctrl, &ndev->last_ps); if (ret < 0) goto unfreeze; From a5df5e79c43c84d9fb88f56b707c5ff52b27ccca Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jul 2021 09:40:43 -0700 Subject: [PATCH 135/315] nvme: allow user toggling hmb usage The NVMe host memory buffer may consume a non-negligable amount of memory. Controllers are required to function without the host memory buffer enabled, but with possibly degraded performance. Export a sysfs property to toggle this feature on a per-device granularity so users may choose to reclaim memory at the expense of storage performance. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 45 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b23d5818f75..b82492cd7503 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -137,6 +137,7 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; u32 last_ps; + bool hmb; mempool_t *iod_mempool; @@ -1896,7 +1897,9 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) dev_warn(dev->ctrl.device, "failed to set host mem (err %d, flags %#x).\n", ret, bits); - } + } else + dev->hmb = bits & NVME_HOST_MEM_ENABLE; + return ret; } @@ -2081,6 +2084,42 @@ static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(cmbsz); +static ssize_t hmb_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->hmb); +} + +static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); + bool new; + int ret; + + if (strtobool(buf, &new) < 0) + return -EINVAL; + + if (new == ndev->hmb) + return count; + + if (new) { + ret = nvme_setup_host_mem(ndev); + } else { + ret = nvme_set_host_mem(ndev, 0); + if (!ret) + nvme_free_host_mem(ndev); + } + + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(hmb); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2094,6 +2133,9 @@ static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, if (!dev->cmbsz) return 0; } + if (a == &dev_attr_hmb.attr && !ctrl->hmpre) + return 0; + return a->mode; } @@ -2101,6 +2143,7 @@ static struct attribute *nvme_pci_attrs[] = { &dev_attr_cmb.attr, &dev_attr_cmbloc.attr, &dev_attr_cmbsz.attr, + &dev_attr_hmb.attr, NULL, }; From d48f92cd2739258a1292be56bbeadb5b6a57ea09 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 6 Aug 2021 08:41:43 -0700 Subject: [PATCH 136/315] nvme-tcp: pair send_mutex init with destroy Each mutex_init() should have a corresponding mutex_destroy(). Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0a97ba02f61e..95d4cf777d24 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1220,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); } @@ -1525,6 +1526,7 @@ err_sock: sock_release(queue->sock); queue->sock = NULL; err_destroy_mutex: + mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); return ret; } From 664227fde63844d69e9ec9e90a8a7801e6ff072d Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Sat, 7 Aug 2021 11:50:23 +0800 Subject: [PATCH 137/315] nvme-tcp: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 95d4cf777d24..645025620154 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1763,13 +1763,13 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); From 85032874f80ba17bf187de1d14d9603bf3f582b8 Mon Sep 17 00:00:00 2001 From: Ruozhu Li Date: Wed, 28 Jul 2021 17:41:20 +0800 Subject: [PATCH 138/315] nvme-rdma: don't update queue count when failing to set io queues We update ctrl->queue_count and schedule another reconnect when io queue count is zero.But we will never try to create any io queue in next reco- nnection, because ctrl->queue_count already set to zero.We will end up having an admin-only session in Live state, which is exactly what we try to avoid in the original patch. Update ctrl->queue_count after queue_count zero checking to fix it. Signed-off-by: Ruozhu Li Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 69ae67652f38..a68704e39084 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -735,13 +735,13 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->ctrl.queue_count = nr_io_queues + 1; - if (ctrl->ctrl.queue_count < 2) { + if (nr_io_queues == 0) { dev_err(ctrl->ctrl.device, "unable to set any I/O queues\n"); return -ENOMEM; } + ctrl->ctrl.queue_count = nr_io_queues + 1; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); From e804d5abe2d74cfe23f5f83be580d1cdc9307111 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 09:20:14 +0300 Subject: [PATCH 139/315] nvmet: pass back cntlid on successful completion According to the NVMe specification, the response dword 0 value of the Connect command is based on status code: return cntlid for successful compeltion return IPO and IATTR for connect invalid parameters. Fix a missing error information for a zero sized queue, and return the cntlid also for I/O queue Connect commands. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 7d0f3523fdab..8ef564c3b32c 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -120,6 +120,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto err; } @@ -260,11 +261,11 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) } status = nvmet_install_queue(ctrl, req); - if (status) { - /* pass back cntlid that had the issue of installing queue */ - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (status) goto out_ctrl_put; - } + + /* pass back cntlid for successful completion */ + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); From b71df12605cabab47d58bd926badaf4130280e4d Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sun, 8 Aug 2021 18:06:15 +0300 Subject: [PATCH 140/315] nvmet: avoid duplicate qid in connect cmd According to the NVMe specification, if the host sends a Connect command specifying a queue id which has already been created, a status value of NVME_SC_CMD_SEQ_ERROR is returned. Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 1 + drivers/nvme/target/fabrics-cmd.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ac7210a3ea1c..66d05eecc2a9 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -802,6 +802,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) * controller teardown as a result of a keep-alive expiration. */ ctrl->reset_tbkas = true; + sq->ctrl->sqs[sq->qid] = NULL; nvmet_ctrl_put(ctrl); sq->ctrl = NULL; /* allows reusing the queue later */ } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 8ef564c3b32c..6f14da5cd08f 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -111,12 +111,6 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) struct nvmet_ctrl *old; u16 ret; - old = cmpxchg(&req->sq->ctrl, NULL, ctrl); - if (old) { - pr_warn("queue already connected!\n"); - req->error_loc = offsetof(struct nvmf_connect_command, opcode); - return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; - } if (!sqsize) { pr_warn("queue size zero!\n"); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); @@ -125,6 +119,19 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) goto err; } + if (ctrl->sqs[qid] != NULL) { + pr_warn("qid %u has already been created\n", qid); + req->error_loc = offsetof(struct nvmf_connect_command, qid); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); @@ -139,6 +146,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) if (ret) { pr_err("failed to install queue %d cntlid %d ret %x\n", qid, ctrl->cntlid, ret); + ctrl->sqs[qid] = NULL; goto err; } } From e19e9f47f341cafcaf41253723f083223a4652a5 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Thu, 5 Aug 2021 18:02:51 +0300 Subject: [PATCH 141/315] nvmet: check that host sqsize does not exceed ctrl MQES Check that host sqsize is not greater-than Maximum Queue Entries Supported (MQES) value supported by the controller. Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 6f14da5cd08f..7d0454cee920 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -109,6 +109,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) u16 qid = le16_to_cpu(c->qid); u16 sqsize = le16_to_cpu(c->sqsize); struct nvmet_ctrl *old; + u16 mqes = NVME_CAP_MQES(ctrl->cap); u16 ret; if (!sqsize) { @@ -125,6 +126,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } + if (sqsize > mqes) { + pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", + sqsize, mqes, ctrl->cntlid); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); + req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); From 9451aa0aacaf7ea13d1acfd5de8b63a6e0b24fac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:13 +0200 Subject: [PATCH 142/315] block: free the extended dev_t minor later The dev_t is used as the inode hash, so we should only released it once then block device inode is gone from the inode cache. Move it to bdev_free_inode to ensure that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 -- block/partitions/core.c | 2 -- fs/block_dev.c | 5 +++++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 9d6b3aeea288..ed58ddf6258b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1085,8 +1085,6 @@ static void disk_release(struct device *dev) might_sleep(); bdi_put(disk->bdi); - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/block/partitions/core.c b/block/partitions/core.c index 9265936df77e..58c4c362c94f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -259,8 +259,6 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(dev->devt)); put_disk(dev_to_bdev(dev)->bd_disk); iput(dev_to_bdev(dev)->bd_inode); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 38a8b0e04a0c..4bd2a632c79c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include #include #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -813,6 +814,10 @@ static void bdev_free_inode(struct inode *inode) if (!bdev_is_partition(bdev)) kfree(bdev->bd_disk); + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } From 889c05cc5834a1eef2dbe1e639cfd7a81c4f4c6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:26:14 +0200 Subject: [PATCH 143/315] block: ensure the bdi is freed after inode_detach_wb inode_detach_wb references the "main" bdi of the inode. With the recent change to move the bdi from the request_queue to the gendisk this causes a guaranteed use after free when using certain cgroup configurations. The big itself is older through as any non-default inode reference (e.g. an open file descriptor) could have injected this use after free even before that. Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reported-by: Qian Cai Reported-by: syzbot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816122614.601358-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 1 - fs/block_dev.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ed58ddf6258b..731a46063132 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1084,7 +1084,6 @@ static void disk_release(struct device *dev) might_sleep(); - bdi_put(disk->bdi); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/fs/block_dev.c b/fs/block_dev.c index 4bd2a632c79c..d3a8062302a0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -812,8 +812,11 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) blk_free_ext_minor(MINOR(bdev->bd_dev)); @@ -833,8 +836,6 @@ static void bdev_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); } static const struct super_operations bdev_sops = { From 1113f0b69c6a98ff4e733c306a6658a31f8cbc49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:20 +0200 Subject: [PATCH 144/315] bvec: add a bvec_virt helper Add a helper to get the virtual address for a bvec. This avoids that all callers need to know about the page + offset representation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f9fa43b940ff..0e9bdd42dafb 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -229,4 +229,16 @@ static inline void memzero_bvec(struct bio_vec *bvec) memzero_page(bvec->bv_page, bvec->bv_offset, bvec->bv_len); } +/** + * bvec_virt - return the virtual address for a bvec + * @bvec: bvec to return the virtual address for + * + * Note: the caller must ensure that @bvec->bv_page is not a highmem page. + */ +static inline void *bvec_virt(struct bio_vec *bvec) +{ + WARN_ON_ONCE(PageHighMem(bvec->bv_page)); + return page_address(bvec->bv_page) + bvec->bv_offset; +} + #endif /* __LINUX_BVEC_H */ From b93ef45350c0119ddc275601438c89231b198414 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:21 +0200 Subject: [PATCH 145/315] block: use bvec_virt in bio_integrity_{process,free} Use the bvec_virt helper to clean up the bio integrity processing a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Acked-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio-integrity.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 8f54d49dc500..6b47cddbbca1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -104,8 +104,7 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -163,13 +162,11 @@ static blk_status_t bio_integrity_process(struct bio *bio, struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); From 1c277e501334238f6c4f57d16d14e7c911550075 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:22 +0200 Subject: [PATCH 146/315] dm: make EBS depend on !HIGHMEM __ebs_rw_bvec use page_address on the submitted bios data, and thus can't deal with highmem. Disable the target on highmem configs. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f821dae101a9..f45fb372e51b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -342,7 +342,7 @@ config DM_WRITECACHE config DM_EBS tristate "Emulated block size target (EXPERIMENTAL)" - depends on BLK_DEV_DM + depends on BLK_DEV_DM && !HIGHMEM select DM_BUFIO help dm-ebs emulates smaller logical block size on backing devices From 3a8ba33bd71a4126b9e799e8d29d6d5da08c93f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:23 +0200 Subject: [PATCH 147/315] dm-ebs: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 71475a2410be..0c509dae0ff8 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -74,7 +74,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bv if (unlikely(!bv->bv_page || !bv_len)) return -EIO; - pa = page_address(bv->bv_page) + bv->bv_offset; + pa = bvec_virt(bv); /* Handle overlapping page <-> blocks */ while (bv_len) { From 964cacfdd34cd48e3b5b714c3cc33427001e843f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:24 +0200 Subject: [PATCH 148/315] dm-integrity: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 20f2510db1f6..a9ea361769a7 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1819,7 +1819,7 @@ again: unsigned this_len; BUG_ON(PageHighMem(biv.bv_page)); - tag = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag = bvec_virt(&biv); this_len = min(biv.bv_len, data_to_process); r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset, this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE); @@ -2006,7 +2006,7 @@ retry_kmap: unsigned tag_now = min(biv.bv_len, tag_todo); char *tag_addr; BUG_ON(PageHighMem(biv.bv_page)); - tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset; + tag_addr = bvec_virt(&biv); if (likely(dio->op == REQ_OP_WRITE)) memcpy(tag_ptr, tag_addr, tag_now); else From fbc27241e537d3a99d0f843a4080e1d2fb014fb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:25 +0200 Subject: [PATCH 149/315] squashfs: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-7-hch@lst.de Signed-off-by: Jens Axboe --- fs/squashfs/block.c | 7 +++---- fs/squashfs/lz4_wrapper.c | 2 +- fs/squashfs/lzo_wrapper.c | 2 +- fs/squashfs/xz_wrapper.c | 2 +- fs/squashfs/zlib_wrapper.c | 2 +- fs/squashfs/zstd_wrapper.c | 2 +- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; From cf58b537781df6eee2bbeae0463e45acf727978a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:26 +0200 Subject: [PATCH 150/315] rbd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20210804095634.460779-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6d596c2c2cd6..e65c9d706f6f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2986,8 +2986,7 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) }; ceph_bvec_iter_advance_step(&it, bytes, ({ - if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, - bv.bv_len)) + if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len)) return false; })); return true; From 358b348b9197b977276e0f034c474380565879e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:27 +0200 Subject: [PATCH 151/315] virtio_blk: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20210804095634.460779-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4b49df2dfd23..767b4f72a54d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -166,11 +166,8 @@ static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - kfree(page_address(req->special_vec.bv_page) + - req->special_vec.bv_offset); - } - + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) + kfree(bvec_virt(&req->special_vec)); blk_mq_end_request(req, virtblk_result(vbr)); } From 2fd3e5efe791946be0957c8e1eed9560b541fe46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:28 +0200 Subject: [PATCH 152/315] bcache: use bvec_virt Use bvec_virt instead of open coding it. Note that the existing code is fine despite ignoring bv_offset as the bio is known to contain exactly one page from the page allocator per bio_vec. Signed-off-by: Christoph Hellwig Reviewed-by: Coly Li Link: https://lore.kernel.org/r/20210804095634.460779-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 183a58c89377..0595559de174 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -378,7 +378,7 @@ static void do_btree_node_write(struct btree *b) struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, b->bio, iter_all) { - memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + memcpy(bvec_virt(bv), addr, PAGE_SIZE); addr += PAGE_SIZE; } From c3c770563510aa66fd8e84b374daf43e236fa4ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:29 +0200 Subject: [PATCH 153/315] sd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Acked-by: Martin K. Petersen Link: https://lore.kernel.org/r/20210804095634.460779-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b8d55af763f9..5b5b8266e142 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -886,7 +886,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; - buf = page_address(rq->special_vec.bv_page); + buf = bvec_virt(&rq->special_vec); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); From 25d84545beaae8e9427bbd25feff309363cd0a58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:30 +0200 Subject: [PATCH 154/315] ubd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Acked-By: Anton Ivanov Link: https://lore.kernel.org/r/20210804095634.460779-12-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index e497185dd393..cd9dc0556e91 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1268,8 +1268,7 @@ static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, rq_for_each_segment(bvec, req, iter) { BUG_ON(i >= io_req->desc_cnt); - io_req->io_desc[i].buffer = - page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].buffer = bvec_virt(&bvec); io_req->io_desc[i].length = bvec.bv_len; i++; } From 6da525b3ecaea04eaaeb3277f6e16d91ecfdb84a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:31 +0200 Subject: [PATCH 155/315] ps3vram: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 7fbf469651c4..c7b19e128b03 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -541,7 +541,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, bio_for_each_segment(bvec, bio, iter) { /* PS3 is ppc64, so we don't handle highmem */ - char *ptr = page_address(bvec.bv_page) + bvec.bv_offset; + char *ptr = bvec_virt(&bvec); size_t len = bvec.bv_len, retlen; dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op, From bf5fb875b494b32ef81fdfa5530a79fc22486254 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:32 +0200 Subject: [PATCH 156/315] dasd: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Link: https://lore.kernel.org/r/20210804095634.460779-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_diag.c | 2 +- drivers/s390/block/dasd_eckd.c | 14 +++++++------- drivers/s390/block/dasd_fba.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 6bb775236c16..db5987281010 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -552,7 +552,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, dbio = dreq->bio; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { memset(dbio, 0, sizeof (struct dasd_diag_bio)); dbio->type = rw_cmd; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 0de1a463c509..8610ea414322 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3267,7 +3267,7 @@ static int dasd_eckd_ese_read(struct dasd_ccw_req *cqr, struct irb *irb) end_blk = (curr_trk + 1) * recs_per_trk; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { if (first_blk + blk_count >= end_blk) { cqr->proc_bytes = blk_count * blksize; @@ -3999,7 +3999,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( last_rec - recid + 1, cmd, basedev, blksize); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -4166,7 +4166,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( idaw_dst = NULL; idaw_len = 0; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4509,7 +4509,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( new_track = 1; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; while (seg_len) { if (new_track) { @@ -4542,7 +4542,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( } } else { rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); last_tidaw = itcw_add_tidaw(itcw, 0x00, dst, bv.bv_len); if (IS_ERR(last_tidaw)) { @@ -4778,7 +4778,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, idaws = idal_create_words(idaws, rawpadpage, PAGE_SIZE); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); seg_len = bv.bv_len; if (cmd == DASD_ECKD_CCW_READ_TRACK) memset(dst, 0, seg_len); @@ -4839,7 +4839,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->uses_cdl == 0 || recid > 2*blk_per_trk) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->uses_cdl && recid <= 2*blk_per_trk) diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 3ad319aee51e..e084f4dedddd 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -501,7 +501,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp_regular( } recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); @@ -583,7 +583,7 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) if (private->rdc_data.mode.bits.data_chain != 0) ccw++; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv.bv_page) + bv.bv_offset; + dst = bvec_virt(&bv); for (off = 0; off < bv.bv_len; off += blksize) { /* Skip locate record. */ if (private->rdc_data.mode.bits.data_chain == 0) From 2b7a8112212afa90f36391e3ab7df531614bfb6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:33 +0200 Subject: [PATCH 157/315] dcssblk: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210804095634.460779-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dcssblk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 29180bdf0977..5be3d1c39a78 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -892,8 +892,7 @@ dcssblk_submit_bio(struct bio *bio) index = (bio->bi_iter.bi_sector >> 3); bio_for_each_segment(bvec, bio, iter) { - page_addr = (unsigned long) - page_address(bvec.bv_page) + bvec.bv_offset; + page_addr = (unsigned long)bvec_virt(&bvec); source_addr = dev_info->start + (index<<12) + bytes_done; if (unlikely((page_addr & 4095) != 0) || (bvec.bv_len & 4095) != 0) // More paranoia. From 3973e15fa5342783ce0009ab3a423ae9b811fc63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Aug 2021 11:56:34 +0200 Subject: [PATCH 158/315] nvme: use bvec_virt Use bvec_virt instead of open coding it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20210804095634.460779-16-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1478d825011d..9e51266c9e2c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -968,12 +968,11 @@ void nvme_cleanup_cmd(struct request *req) { if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; - struct page *page = req->special_vec.bv_page; - if (page == ctrl->discard_page) + if (req->special_vec.bv_page == ctrl->discard_page) clear_bit_unlock(0, &ctrl->discard_page_busy); else - kfree(page_address(page) + req->special_vec.bv_offset); + kfree(bvec_virt(&req->special_vec)); } } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); From 49cb5168a7c6abf9835f9acdce6263bc2deefeb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:22 +0200 Subject: [PATCH 159/315] blk-cgroup: refactor blkcg_print_stat Factor out a helper to deal with a single blkcg_gq to make the code a little bit easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 152 ++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index db034e35ae20..52aa0540ccaf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -870,6 +870,81 @@ static void blkcg_fill_root_iostats(void) } } +static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) +{ + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + bool has_stats = false; + const char *dname; + unsigned seq; + char *buf; + size_t size = seq_get_buf(s, &buf), off = 0; + int i; + + if (!blkg->online) + return; + + dname = blkg_dev_name(blkg); + if (!dname) + return; + + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) + has_stats = true; + off += written; + } + + if (has_stats) { + if (off < size - 1) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(s, off); + } else { + seq_commit(s, -1); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -881,86 +956,11 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; - spin_lock_irq(&blkg->q->queue_lock); - - if (!blkg->online) - goto skip; - - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; - - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); - - do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } - - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; - - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; - - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; - } - - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: + blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } - rcu_read_unlock(); return 0; } From 252c651a4c854b328445a536bd1892e999103fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 17:26:23 +0200 Subject: [PATCH 160/315] blk-cgroup: stop using seq_get_buf seq_get_buf is a crutch that undoes all the memory safety of the seq_file interface. Use the normal seq_printf interfaces instead. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210810152623.1796144-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 30 ++++++------------------------ block/blk-iocost.c | 23 +++++++++-------------- block/blk-iolatency.c | 38 +++++++++++++++++++------------------- block/mq-deadline-cgroup.c | 8 +++----- include/linux/blk-cgroup.h | 4 ++-- 5 files changed, 39 insertions(+), 64 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52aa0540ccaf..b8ec47dcce42 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -877,8 +877,6 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) bool has_stats = false; const char *dname; unsigned seq; - char *buf; - size_t size = seq_get_buf(s, &buf), off = 0; int i; if (!blkg->online) @@ -888,13 +886,7 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (!dname) return; - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); + seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); @@ -909,40 +901,30 @@ static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) if (rbytes || wbytes || rios || wios) { has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; - off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", + seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) + if (pol->pd_stat_fn(blkg->pd[i], s)) has_stats = true; - off += written; } - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(s, off); - } else { - seq_commit(s, -1); - } - } + if (has_stats) + seq_printf(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5fac3757e6e0..89b21a360b2c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2988,34 +2988,29 @@ static void ioc_pd_free(struct blkg_policy_data *pd) kfree(iocg); } -static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; - size_t pos = 0; if (!ioc->enabled) - return 0; + return false; if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( ioc->vtime_base_rate * 10000, VTIME_PER_USEC); - pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", - vp10k / 100, vp10k % 100); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); } - pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", - iocg->last_stat.usage_us); + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); if (blkcg_debug_stats) - pos += scnprintf(buf + pos, size - pos, - " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", - iocg->last_stat.wait_us, - iocg->last_stat.indebt_us, - iocg->last_stat.indelay_us); - - return pos; + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + return true; } static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 81be0096411d..4c06fafb7411 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -886,8 +886,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -902,39 +901,40 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); + return true; } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return false; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); + return true; } - static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c index 3b4bfddec39f..b48a4b962f90 100644 --- a/block/mq-deadline-cgroup.c +++ b/block/mq-deadline-cgroup.c @@ -52,7 +52,7 @@ struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) return dd_blkcg_from_pd(pd); } -static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +static bool dd_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { static const char *const prio_class_name[] = { [IOPRIO_CLASS_NONE] = "NONE", @@ -61,12 +61,10 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) [IOPRIO_CLASS_IDLE] = "IDLE", }; struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); - int res = 0; u8 prio; for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) - res += scnprintf(buf + res, size - res, - " [%s] dispatched=%u inserted=%u merged=%u", + seq_printf(s, " [%s] dispatched=%u inserted=%u merged=%u", prio_class_name[prio], ddcg_sum(blkcg, dispatched, prio) + ddcg_sum(blkcg, merged, prio) - @@ -75,7 +73,7 @@ static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) ddcg_sum(blkcg, completed, prio), ddcg_sum(blkcg, merged, prio)); - return res; + return true; } static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 37048438872c..b4de2010fba5 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -152,8 +152,8 @@ typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf, - size_t size); +typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); struct blkcg_policy { int plid; From 69f87cc7086558ad84f20001256474aa611fc0eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:36:49 +0200 Subject: [PATCH 161/315] block: unexport blk_register_queue Not actually used in any modular code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816123649.601591-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1832587dce3a..586507a5b8c2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -939,7 +939,6 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() From b1a811633f7321cf1ae2bb76a66805b7720e44c9 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 12 Aug 2021 12:15:01 +0300 Subject: [PATCH 162/315] block: nbd: add sanity check for first_minor Syzbot hit WARNING in internal_create_group(). The problem was in too big disk->first_minor. disk->first_minor is initialized by value, which comes from userspace and there wasn't any sanity checks about value correctness. It can cause duplicate creation of sysfs files/links, because disk->first_minor will be passed to MKDEV() which causes truncation to byte. Since maximum minor value is 0xff, let's check if first_minor is correct minor number. NOTE: the root case of the reported warning was in wrong error handling in register_disk(), but we can avoid passing knowingly wrong values to sysfs API, because sysfs error messages can confuse users. For example: user passed 1048576 as index, but sysfs complains about duplicate creation of /dev/block/43:0. It's not obvious how 1048576 becomes 0. Log and reproducer for above example can be found on syzkaller bug report page. Link: https://syzkaller.appspot.com/bug?id=03c2ae9146416edf811958d5fd7acfab75b143d1 Fixes: b0d9111a2d53 ("nbd: use an idr to keep track of nbd devices") Reported-by: syzbot+9937dc42271cd87d4b98@syzkaller.appspotmail.com Reviewed-by: Christoph Hellwig Signed-off-by: Pavel Skripkin Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0fe82626bf70..379032a64a7c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1750,7 +1750,17 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) refcount_set(&nbd->refs, refs); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; + + /* Too big first_minor can cause duplicate creation of + * sysfs files/links, since first_minor will be truncated to + * byte in __device_add_disk(). + */ disk->first_minor = index << part_shift; + if (disk->first_minor > 0xff) { + err = -EINVAL; + goto out_free_idr; + } + disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; From 0866200ed7fdfbfba0c033aad63ff407e5368570 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 May 2021 08:59:46 -0700 Subject: [PATCH 163/315] nvme: Have NVME_FABRICS select NVME_CORE instead of transport drivers Transport drivers need both core and fabrics modules, instead of selecting both, have the selection transitive such that NVME_FABRICS selects NVME_CORE and transport drivers select NVME_FABRICS. Suggested-by: Keith Busch Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 4 +--- drivers/nvme/target/Kconfig | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index c3f3d77f1aac..dc0450ca23a3 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -33,12 +33,12 @@ config NVME_HWMON in the system. config NVME_FABRICS + select NVME_CORE tristate config NVME_RDMA tristate "NVM Express over Fabrics RDMA host driver" depends on INFINIBAND && INFINIBAND_ADDR_TRANS && BLOCK - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -55,7 +55,6 @@ config NVME_FC tristate "NVM Express over Fabrics FC host driver" depends on BLOCK depends on HAS_DMA - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -72,7 +71,6 @@ config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET depends on BLOCK - select NVME_CORE select NVME_FABRICS select CRYPTO select CRYPTO_CRC32C diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4be2ececbc45..973561c93888 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -31,7 +31,6 @@ config NVME_TARGET_PASSTHRU config NVME_TARGET_LOOP tristate "NVMe loopback device support" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL help @@ -65,7 +64,6 @@ config NVME_TARGET_FC config NVME_TARGET_FCLOOP tristate "NVMe over Fabrics FC Transport Loopback Test driver" depends on NVME_TARGET - select NVME_CORE select NVME_FABRICS select SG_POOL depends on NVME_FC From 77979058dfcf4818abf7dd84423a7d66dafd8487 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 16 Aug 2021 09:24:52 -0700 Subject: [PATCH 164/315] nvme: remove nvm_ndev from ns Now that the lightnvm driver is removed, we don't need a pointer to it's now non-existent struct. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 57d2ac00a6bd..37c5ef5a3331 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -444,7 +444,6 @@ struct nvme_ns { u32 ana_grpid; #endif struct list_head siblings; - struct nvm_dev *ndev; struct kref kref; struct nvme_ns_head *head; From 9891668e43c8e9f2d0d50088b151edefc2e560e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 14:45:29 +0200 Subject: [PATCH 165/315] nvme: remove the unused NVME_NS_* enum These values are unused now that the lightnvm support is gone. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/nvme.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 37c5ef5a3331..a2e1f298b217 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -47,11 +47,6 @@ extern struct workqueue_struct *nvme_wq; extern struct workqueue_struct *nvme_reset_wq; extern struct workqueue_struct *nvme_delete_wq; -enum { - NVME_NS_LBA = 0, - NVME_NS_LIGHTNVM = 1, -}; - /* * List of workarounds for devices that required behavior not specified in * the standard. From 355a8031dc174450ccad2a61c513ad7222d87a97 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:47 +0900 Subject: [PATCH 166/315] libata: fix ata_host_start() The loop on entry of ata_host_start() may not initialize host->ops to a non NULL value. The test on the host_stop field of host->ops must then be preceded by a check that host->ops is not NULL. Reported-by: kernel test robot Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-3-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 61c762961ca8..44f434acfce0 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5573,7 +5573,7 @@ int ata_host_start(struct ata_host *host) have_stop = 1; } - if (host->ops->host_stop) + if (host->ops && host->ops->host_stop) have_stop = 1; if (have_stop) { From 56b4f06c55add95fe508a1746d9173bade6388bf Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:48 +0900 Subject: [PATCH 167/315] libata: simplify ata_scsi_rbuf_fill() Sparse complains about context imbalance in ata_scsi_rbuf_get() and ata_scsi_rbuf_put() due to these functions respectively only taking and releasing the ata_scsi_rbuf_lock spinlock. Since these functions are only called from ata_scsi_rbuf_fill() with ata_scsi_rbuf_get() being called with a copy_in argument always false, the code can be simplified and ata_scsi_rbuf_{get|put} removed. This change both simplifies the code and fixes the sparse warning. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-4-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-scsi.c | 60 ++++++--------------------------------- 1 file changed, 9 insertions(+), 51 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index b9588c52815d..0b7b4624e4df 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1765,53 +1765,6 @@ struct ata_scsi_args { struct scsi_cmnd *cmd; }; -/** - * ata_scsi_rbuf_get - Map response buffer. - * @cmd: SCSI command containing buffer to be mapped. - * @flags: unsigned long variable to store irq enable status - * @copy_in: copy in from user buffer - * - * Prepare buffer for simulated SCSI commands. - * - * LOCKING: - * spin_lock_irqsave(ata_scsi_rbuf_lock) on success - * - * RETURNS: - * Pointer to response buffer. - */ -static void *ata_scsi_rbuf_get(struct scsi_cmnd *cmd, bool copy_in, - unsigned long *flags) -{ - spin_lock_irqsave(&ata_scsi_rbuf_lock, *flags); - - memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE); - if (copy_in) - sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), - ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); - return ata_scsi_rbuf; -} - -/** - * ata_scsi_rbuf_put - Unmap response buffer. - * @cmd: SCSI command containing buffer to be unmapped. - * @copy_out: copy out result - * @flags: @flags passed to ata_scsi_rbuf_get() - * - * Returns rbuf buffer. The result is copied to @cmd's buffer if - * @copy_back is true. - * - * LOCKING: - * Unlocks ata_scsi_rbuf_lock. - */ -static inline void ata_scsi_rbuf_put(struct scsi_cmnd *cmd, bool copy_out, - unsigned long *flags) -{ - if (copy_out) - sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), - ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); - spin_unlock_irqrestore(&ata_scsi_rbuf_lock, *flags); -} - /** * ata_scsi_rbuf_fill - wrapper for SCSI command simulators * @args: device IDENTIFY data / SCSI command of interest. @@ -1830,14 +1783,19 @@ static inline void ata_scsi_rbuf_put(struct scsi_cmnd *cmd, bool copy_out, static void ata_scsi_rbuf_fill(struct ata_scsi_args *args, unsigned int (*actor)(struct ata_scsi_args *args, u8 *rbuf)) { - u8 *rbuf; unsigned int rc; struct scsi_cmnd *cmd = args->cmd; unsigned long flags; - rbuf = ata_scsi_rbuf_get(cmd, false, &flags); - rc = actor(args, rbuf); - ata_scsi_rbuf_put(cmd, rc == 0, &flags); + spin_lock_irqsave(&ata_scsi_rbuf_lock, flags); + + memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE); + rc = actor(args, ata_scsi_rbuf); + if (rc == 0) + sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), + ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE); + + spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags); if (rc == 0) cmd->result = SAM_STAT_GOOD; From d8d8778c24cc4689250b59c426489a360032d912 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:49 +0900 Subject: [PATCH 168/315] libata: cleanup device sleep capability detection Move the code to retrieve the device sleep capability and timings out of ata_dev_configure() into the helper function ata_dev_config_devslp(). While at it, mark the device as supporting the device sleep capability only if the sata settings page was retrieved successfully to ensure that the timing information is correctly initialized. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-5-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 55 +++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 44f434acfce0..f96cd2f931bd 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2363,6 +2363,37 @@ static void ata_dev_config_trusted(struct ata_device *dev) dev->flags |= ATA_DFLAG_TRUSTED; } +static void ata_dev_config_devslp(struct ata_device *dev) +{ + u8 *sata_setting = dev->link->ap->sector_buf; + unsigned int err_mask; + int i, j; + + /* + * Check device sleep capability. Get DevSlp timing variables + * from SATA Settings page of Identify Device Data Log. + */ + if (!ata_id_has_devslp(dev->id)) + return; + + err_mask = ata_read_log_page(dev, + ATA_LOG_IDENTIFY_DEVICE, + ATA_LOG_SATA_SETTINGS, + sata_setting, 1); + if (err_mask) { + ata_dev_dbg(dev, + "failed to get SATA Settings Log, Emask 0x%x\n", + err_mask); + return; + } + + dev->flags |= ATA_DFLAG_DEVSLP; + for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { + j = ATA_LOG_DEVSLP_OFFSET + i; + dev->devslp_timing[i] = sata_setting[j]; + } +} + /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure @@ -2565,29 +2596,7 @@ int ata_dev_configure(struct ata_device *dev) } } - /* Check and mark DevSlp capability. Get DevSlp timing variables - * from SATA Settings page of Identify Device Data Log. - */ - if (ata_id_has_devslp(dev->id)) { - u8 *sata_setting = ap->sector_buf; - int i, j; - - dev->flags |= ATA_DFLAG_DEVSLP; - err_mask = ata_read_log_page(dev, - ATA_LOG_IDENTIFY_DEVICE, - ATA_LOG_SATA_SETTINGS, - sata_setting, - 1); - if (err_mask) - ata_dev_dbg(dev, - "failed to get Identify Device Data, Emask 0x%x\n", - err_mask); - else - for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { - j = ATA_LOG_DEVSLP_OFFSET + i; - dev->devslp_timing[i] = sata_setting[j]; - } - } + ata_dev_config_devslp(dev); ata_dev_config_sense_reporting(dev); ata_dev_config_zac(dev); ata_dev_config_trusted(dev); From 891fd7c61952ed3fddb82a3b00ae4b3edfce8733 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:50 +0900 Subject: [PATCH 169/315] libata: cleanup ata_dev_configure() Introduce the helper functions ata_dev_config_lba() and ata_dev_config_chs() to configure the addressing capabilities of a device. To control message printing in these new helpers, as well as in ata_dev_configure() and in ata_hpa_resize(), add the helper function ata_dev_print_info() to avoid open coding for the eh context ATA_EHI_PRINTINFO flag in multiple functions. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-6-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 131 ++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f96cd2f931bd..f600ea16a92c 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -159,6 +159,12 @@ MODULE_DESCRIPTION("Library module for ATA devices"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); +static inline bool ata_dev_print_info(struct ata_device *dev) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + + return ehc->i.flags & ATA_EHI_PRINTINFO; +} static bool ata_sstatus_online(u32 sstatus) { @@ -1266,8 +1272,7 @@ static int ata_set_max_sectors(struct ata_device *dev, u64 new_sectors) */ static int ata_hpa_resize(struct ata_device *dev) { - struct ata_eh_context *ehc = &dev->link->eh_context; - int print_info = ehc->i.flags & ATA_EHI_PRINTINFO; + bool print_info = ata_dev_print_info(dev); bool unlock_hpa = ata_ignore_hpa || dev->flags & ATA_DFLAG_UNLOCK_HPA; u64 sectors = ata_id_n_sectors(dev->id); u64 native_sectors; @@ -2363,6 +2368,65 @@ static void ata_dev_config_trusted(struct ata_device *dev) dev->flags |= ATA_DFLAG_TRUSTED; } +static int ata_dev_config_lba(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + const u16 *id = dev->id; + const char *lba_desc; + char ncq_desc[24]; + int ret; + + dev->flags |= ATA_DFLAG_LBA; + + if (ata_id_has_lba48(id)) { + lba_desc = "LBA48"; + dev->flags |= ATA_DFLAG_LBA48; + if (dev->n_sectors >= (1UL << 28) && + ata_id_has_flush_ext(id)) + dev->flags |= ATA_DFLAG_FLUSH_EXT; + } else { + lba_desc = "LBA"; + } + + /* config NCQ */ + ret = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); + + /* print device info to dmesg */ + if (ata_msg_drv(ap) && ata_dev_print_info(dev)) + ata_dev_info(dev, + "%llu sectors, multi %u: %s %s\n", + (unsigned long long)dev->n_sectors, + dev->multi_count, lba_desc, ncq_desc); + + return ret; +} + +static void ata_dev_config_chs(struct ata_device *dev) +{ + struct ata_port *ap = dev->link->ap; + const u16 *id = dev->id; + + if (ata_id_current_chs_valid(id)) { + /* Current CHS translation is valid. */ + dev->cylinders = id[54]; + dev->heads = id[55]; + dev->sectors = id[56]; + } else { + /* Default translation */ + dev->cylinders = id[1]; + dev->heads = id[3]; + dev->sectors = id[6]; + } + + /* print device info to dmesg */ + if (ata_msg_drv(ap) && ata_dev_print_info(dev)) + ata_dev_info(dev, + "%llu sectors, multi %u, CHS %u/%u/%u\n", + (unsigned long long)dev->n_sectors, + dev->multi_count, dev->cylinders, + dev->heads, dev->sectors); +} + static void ata_dev_config_devslp(struct ata_device *dev) { u8 *sata_setting = dev->link->ap->sector_buf; @@ -2410,8 +2474,7 @@ static void ata_dev_config_devslp(struct ata_device *dev) int ata_dev_configure(struct ata_device *dev) { struct ata_port *ap = dev->link->ap; - struct ata_eh_context *ehc = &dev->link->eh_context; - int print_info = ehc->i.flags & ATA_EHI_PRINTINFO; + bool print_info = ata_dev_print_info(dev); const u16 *id = dev->id; unsigned long xfer_mask; unsigned int err_mask; @@ -2538,62 +2601,18 @@ int ata_dev_configure(struct ata_device *dev) dev->multi_count = cnt; } + /* print device info to dmesg */ + if (ata_msg_drv(ap) && print_info) + ata_dev_info(dev, "%s: %s, %s, max %s\n", + revbuf, modelbuf, fwrevbuf, + ata_mode_string(xfer_mask)); + if (ata_id_has_lba(id)) { - const char *lba_desc; - char ncq_desc[24]; - - lba_desc = "LBA"; - dev->flags |= ATA_DFLAG_LBA; - if (ata_id_has_lba48(id)) { - dev->flags |= ATA_DFLAG_LBA48; - lba_desc = "LBA48"; - - if (dev->n_sectors >= (1UL << 28) && - ata_id_has_flush_ext(id)) - dev->flags |= ATA_DFLAG_FLUSH_EXT; - } - - /* config NCQ */ - rc = ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); + rc = ata_dev_config_lba(dev); if (rc) return rc; - - /* print device info to dmesg */ - if (ata_msg_drv(ap) && print_info) { - ata_dev_info(dev, "%s: %s, %s, max %s\n", - revbuf, modelbuf, fwrevbuf, - ata_mode_string(xfer_mask)); - ata_dev_info(dev, - "%llu sectors, multi %u: %s %s\n", - (unsigned long long)dev->n_sectors, - dev->multi_count, lba_desc, ncq_desc); - } } else { - /* CHS */ - - /* Default translation */ - dev->cylinders = id[1]; - dev->heads = id[3]; - dev->sectors = id[6]; - - if (ata_id_current_chs_valid(id)) { - /* Current CHS translation is valid. */ - dev->cylinders = id[54]; - dev->heads = id[55]; - dev->sectors = id[56]; - } - - /* print device info to dmesg */ - if (ata_msg_drv(ap) && print_info) { - ata_dev_info(dev, "%s: %s, %s, max %s\n", - revbuf, modelbuf, fwrevbuf, - ata_mode_string(xfer_mask)); - ata_dev_info(dev, - "%llu sectors, multi %u, CHS %u/%u/%u\n", - (unsigned long long)dev->n_sectors, - dev->multi_count, dev->cylinders, - dev->heads, dev->sectors); - } + ata_dev_config_chs(dev); } ata_dev_config_devslp(dev); From 2360fa1812cd77e1de13d3cca789fbd23462b651 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:51 +0900 Subject: [PATCH 170/315] libata: cleanup NCQ priority handling The ata device flag ATA_DFLAG_NCQ_PRIO indicates if a device supports the NCQ Priority feature while the ATA_DFLAG_NCQ_PRIO_ENABLE device flag indicates if the feature is enabled. Enabling NCQ priority use is controlled by the user through the device sysfs attribute ncq_prio_enable. As a result, the ATA_DFLAG_NCQ_PRIO flag should not be cleared when ATA_DFLAG_NCQ_PRIO_ENABLE is not set as the device still supports the feature even after the user disables it. This leads to the following cleanups: - In ata_build_rw_tf(), set a command high priority bit based on the ATA_DFLAG_NCQ_PRIO_ENABLE flag, not on the ATA_DFLAG_NCQ flag. That is, set a command high priority only if the user enabled NCQ priority use. - In ata_dev_config_ncq_prio(), ATA_DFLAG_NCQ_PRIO should not be cleared if ATA_DFLAG_NCQ_PRIO_ENABLE is not set. If the device does not support NCQ priority, both ATA_DFLAG_NCQ_PRIO and ATA_DFLAG_NCQ_PRIO_ENABLE must be cleared. With the above ata_dev_config_ncq_prio() change, ATA_DFLAG_NCQ_PRIO flag is set on device scan and revalidation. There is no need to trigger a device revalidation in ata_ncq_prio_enable_store() when the user enables the use of NCQ priority. Remove the revalidation code from that funciton to simplify it. Also change the return value from -EIO to -EINVAL when a user tries to enable NCQ priority for a device that does not support this feature. While at it, also simplify ata_ncq_prio_enable_show(). Overall, there is no functional change introduced by this patch. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-7-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 32 ++++++++++++++------------------ drivers/ata/libata-sata.c | 37 ++++++++++++------------------------- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f600ea16a92c..f285a2bc8799 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -712,11 +712,9 @@ int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev, if (tf->flags & ATA_TFLAG_FUA) tf->device |= 1 << 7; - if (dev->flags & ATA_DFLAG_NCQ_PRIO) { - if (class == IOPRIO_CLASS_RT) - tf->hob_nsect |= ATA_PRIO_HIGH << - ATA_SHIFT_PRIO; - } + if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE && + class == IOPRIO_CLASS_RT) + tf->hob_nsect |= ATA_PRIO_HIGH << ATA_SHIFT_PRIO; } else if (dev->flags & ATA_DFLAG_LBA) { tf->flags |= ATA_TFLAG_LBA; @@ -2178,11 +2176,6 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) struct ata_port *ap = dev->link->ap; unsigned int err_mask; - if (!(dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE)) { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO; - return; - } - err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, @@ -2190,18 +2183,21 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) 1); if (err_mask) { ata_dev_dbg(dev, - "failed to get Identify Device data, Emask 0x%x\n", + "failed to get SATA settings log, Emask 0x%x\n", err_mask); - return; + goto not_supported; } - if (ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3)) { - dev->flags |= ATA_DFLAG_NCQ_PRIO; - } else { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO; - ata_dev_dbg(dev, "SATA page does not support priority\n"); - } + if (!(ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3))) + goto not_supported; + dev->flags |= ATA_DFLAG_NCQ_PRIO; + + return; + +not_supported: + dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; + dev->flags &= ~ATA_DFLAG_NCQ_PRIO; } static int ata_dev_config_ncq(struct ata_device *dev, diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index 8adeab76dd38..dc397ebda089 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -839,23 +839,17 @@ static ssize_t ata_ncq_prio_enable_show(struct device *device, char *buf) { struct scsi_device *sdev = to_scsi_device(device); - struct ata_port *ap; + struct ata_port *ap = ata_shost_to_port(sdev->host); struct ata_device *dev; bool ncq_prio_enable; int rc = 0; - ap = ata_shost_to_port(sdev->host); - spin_lock_irq(ap->lock); dev = ata_scsi_find_dev(ap, sdev); - if (!dev) { + if (!dev) rc = -ENODEV; - goto unlock; - } - - ncq_prio_enable = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE; - -unlock: + else + ncq_prio_enable = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLE; spin_unlock_irq(ap->lock); return rc ? rc : snprintf(buf, 20, "%u\n", ncq_prio_enable); @@ -869,7 +863,7 @@ static ssize_t ata_ncq_prio_enable_store(struct device *device, struct ata_port *ap; struct ata_device *dev; long int input; - int rc; + int rc = 0; rc = kstrtol(buf, 10, &input); if (rc) @@ -883,27 +877,20 @@ static ssize_t ata_ncq_prio_enable_store(struct device *device, return -ENODEV; spin_lock_irq(ap->lock); + + if (!(dev->flags & ATA_DFLAG_NCQ_PRIO)) { + rc = -EINVAL; + goto unlock; + } + if (input) dev->flags |= ATA_DFLAG_NCQ_PRIO_ENABLE; else dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; - dev->link->eh_info.action |= ATA_EH_REVALIDATE; - dev->link->eh_info.flags |= ATA_EHI_QUIET; - ata_port_schedule_eh(ap); +unlock: spin_unlock_irq(ap->lock); - ata_port_wait_eh(ap); - - if (input) { - spin_lock_irq(ap->lock); - if (!(dev->flags & ATA_DFLAG_NCQ_PRIO)) { - dev->flags &= ~ATA_DFLAG_NCQ_PRIO_ENABLE; - rc = -EIO; - } - spin_unlock_irq(ap->lock); - } - return rc ? rc : len; } From fc5c8aa7bc4977205e0ceb93425075f8a8f49501 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:52 +0900 Subject: [PATCH 171/315] libata: fix ata_read_log_page() warning Support for the READ LOG PAGE DMA EXT command is indicated by words 119 and 120 of a device identify data. This is tested in ata_read_log_page() with ata_id_has_read_log_dma_ext() and the READ LOG PAGE DMA command used if the device reports supports for it. However, some devices lie about this support and using the DMA version of the command fails, generating the warning message "READ LOG DMA EXT failed, trying PIO". Since READ LOG PAGE DMA EXT is an optional command, this warning is not at all important but may be scary for the user. Change ata_read_log_page() to suppres this warning and to print an error message if both DMA and PIO attempts failed. With this change, there is no need to print again an error message when ata_read_log_page() returns an error. So simplify the users of this function. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-8-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 47 +++++++++++---------------------------- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index f285a2bc8799..5befe6ce6039 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2026,13 +2026,15 @@ retry: err_mask = ata_exec_internal(dev, &tf, NULL, DMA_FROM_DEVICE, buf, sectors * ATA_SECT_SIZE, 0); - if (err_mask && dma) { - dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; - ata_dev_warn(dev, "READ LOG DMA EXT failed, trying PIO\n"); - goto retry; + if (err_mask) { + if (dma) { + dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; + goto retry; + } + ata_dev_err(dev, "Read log page 0x%02x failed, Emask 0x%x\n", + (unsigned int)page, err_mask); } - DPRINTK("EXIT, err_mask=%x\n", err_mask); return err_mask; } @@ -2061,12 +2063,8 @@ static bool ata_identify_page_supported(struct ata_device *dev, u8 page) */ err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, 0, ap->sector_buf, 1); - if (err) { - ata_dev_info(dev, - "failed to get Device Identify Log Emask 0x%x\n", - err); + if (err) return false; - } for (i = 0; i < ap->sector_buf[8]; i++) { if (ap->sector_buf[9 + i] == page) @@ -2130,11 +2128,7 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev) } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, 0, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get NCQ Send/Recv Log Emask 0x%x\n", - err_mask); - } else { + if (!err_mask) { u8 *cmds = dev->ncq_send_recv_cmds; dev->flags |= ATA_DFLAG_NCQ_SEND_RECV; @@ -2160,11 +2154,7 @@ static void ata_dev_config_ncq_non_data(struct ata_device *dev) } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, 0, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get NCQ Non-Data Log Emask 0x%x\n", - err_mask); - } else { + if (!err_mask) { u8 *cmds = dev->ncq_non_data_cmds; memcpy(cmds, ap->sector_buf, ATA_LOG_NCQ_NON_DATA_SIZE); @@ -2181,12 +2171,8 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) ATA_LOG_SATA_SETTINGS, ap->sector_buf, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get SATA settings log, Emask 0x%x\n", - err_mask); + if (err_mask) goto not_supported; - } if (!(ap->sector_buf[ATA_LOG_NCQ_PRIO_OFFSET] & BIT(3))) goto not_supported; @@ -2347,11 +2333,8 @@ static void ata_dev_config_trusted(struct ata_device *dev) err = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SECURITY, ap->sector_buf, 1); - if (err) { - ata_dev_dbg(dev, - "failed to read Security Log, Emask 0x%x\n", err); + if (err) return; - } trusted_cap = get_unaligned_le64(&ap->sector_buf[40]); if (!(trusted_cap & (1ULL << 63))) { @@ -2440,12 +2423,8 @@ static void ata_dev_config_devslp(struct ata_device *dev) ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, sata_setting, 1); - if (err_mask) { - ata_dev_dbg(dev, - "failed to get SATA Settings Log, Emask 0x%x\n", - err_mask); + if (err_mask) return; - } dev->flags |= ATA_DFLAG_DEVSLP; for (i = 0; i < ATA_LOG_DEVSLP_SIZE; i++) { From d633b8a702ab2eb4ef9263f1ab1610bb8cdf71a5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:53 +0900 Subject: [PATCH 172/315] libata: print feature list on device scan Print a list of features supported by a drive when it is configured in ata_dev_configure() using the new function ata_dev_print_features(). The features printed are not already advertized and are: trusted send-recev support, device attention support, device sleep support, NCQ send-recv support and NCQ priority support. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-9-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libata-core.c | 17 +++++++++++++++++ include/linux/libata.h | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5befe6ce6039..b8459c54f739 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2433,6 +2433,20 @@ static void ata_dev_config_devslp(struct ata_device *dev) } } +static void ata_dev_print_features(struct ata_device *dev) +{ + if (!(dev->flags & ATA_DFLAG_FEATURES_MASK)) + return; + + ata_dev_info(dev, + "Features:%s%s%s%s%s\n", + dev->flags & ATA_DFLAG_TRUSTED ? " Trust" : "", + dev->flags & ATA_DFLAG_DA ? " Dev-Attention" : "", + dev->flags & ATA_DFLAG_DEVSLP ? " Dev-Sleep" : "", + dev->flags & ATA_DFLAG_NCQ_SEND_RECV ? " NCQ-sndrcv" : "", + dev->flags & ATA_DFLAG_NCQ_PRIO ? " NCQ-prio" : ""); +} + /** * ata_dev_configure - Configure the specified ATA/ATAPI device * @dev: Target device to configure @@ -2595,6 +2609,9 @@ int ata_dev_configure(struct ata_device *dev) ata_dev_config_zac(dev); ata_dev_config_trusted(dev); dev->cdb_len = 32; + + if (ata_msg_drv(ap) && print_info) + ata_dev_print_features(dev); } /* ATAPI-specific feature tests */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 3fcd24236793..b23f28cfc8e0 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -161,6 +161,10 @@ enum { ATA_DFLAG_D_SENSE = (1 << 29), /* Descriptor sense requested */ ATA_DFLAG_ZAC = (1 << 30), /* ZAC device */ + ATA_DFLAG_FEATURES_MASK = ATA_DFLAG_TRUSTED | ATA_DFLAG_DA | \ + ATA_DFLAG_DEVSLP | ATA_DFLAG_NCQ_SEND_RECV | \ + ATA_DFLAG_NCQ_PRIO, + ATA_DEV_UNKNOWN = 0, /* unknown device */ ATA_DEV_ATA = 1, /* ATA device */ ATA_DEV_ATA_UNSUP = 2, /* ATA device (unsupported) */ From 5f91b8f54874300a8e3c6c89f39ce5a74a449f2c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:54 +0900 Subject: [PATCH 173/315] libata: Introduce ncq_prio_supported sysfs sttribute Currently, the only way a user can determine if a SATA device supports NCQ priority is to try to enable the use of this feature using the ncq_prio_enable sysfs device attribute. If enabling the feature fails, it is because the device does not support NCQ priority. Otherwise, the feature is enabled and success indicates that the device supports NCQ priority. Improve this odd interface by introducing the read-only ncq_prio_supported sysfs device attribute to indicate if a SATA device supports NCQ priority. The value of this attribute reflects the status of device flag ATA_DFLAG_NCQ_PRIO, which is set only for devices supporting NCQ priority. Add this new sysfs attribute to the device attributes group of libahci and libata-sata. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210816014456.2191776-10-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/ata/libahci.c | 1 + drivers/ata/libata-sata.c | 25 +++++++++++++++++++++++++ include/linux/libata.h | 1 + 3 files changed, 27 insertions(+) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index fec2e9754aed..5b3fa2cbe722 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(ahci_shost_attrs); struct device_attribute *ahci_sdev_attrs[] = { &dev_attr_sw_activity, &dev_attr_unload_heads, + &dev_attr_ncq_prio_supported, &dev_attr_ncq_prio_enable, NULL }; diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index dc397ebda089..8f3ff830ab0c 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -834,6 +834,30 @@ DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_store); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); +static ssize_t ata_ncq_prio_supported_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(device); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *dev; + bool ncq_prio_supported; + int rc = 0; + + spin_lock_irq(ap->lock); + dev = ata_scsi_find_dev(ap, sdev); + if (!dev) + rc = -ENODEV; + else + ncq_prio_supported = dev->flags & ATA_DFLAG_NCQ_PRIO; + spin_unlock_irq(ap->lock); + + return rc ? rc : sysfs_emit(buf, "%u\n", ncq_prio_supported); +} + +DEVICE_ATTR(ncq_prio_supported, S_IRUGO, ata_ncq_prio_supported_show, NULL); +EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_supported); + static ssize_t ata_ncq_prio_enable_show(struct device *device, struct device_attribute *attr, char *buf) @@ -901,6 +925,7 @@ EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_enable); struct device_attribute *ata_ncq_sdev_attrs[] = { &dev_attr_unload_heads, &dev_attr_ncq_prio_enable, + &dev_attr_ncq_prio_supported, NULL }; EXPORT_SYMBOL_GPL(ata_ncq_sdev_attrs); diff --git a/include/linux/libata.h b/include/linux/libata.h index b23f28cfc8e0..a2d1bae7900b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -539,6 +539,7 @@ typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes) extern struct device_attribute dev_attr_unload_heads; #ifdef CONFIG_SATA_HOST extern struct device_attribute dev_attr_link_power_management_policy; +extern struct device_attribute dev_attr_ncq_prio_supported; extern struct device_attribute dev_attr_ncq_prio_enable; extern struct device_attribute dev_attr_em_message_type; extern struct device_attribute dev_attr_em_message; From 5b8a2345e64b7c9ad00d1bd2d5081d14c574d989 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 16 Aug 2021 10:44:55 +0900 Subject: [PATCH 174/315] docs: sysfs-block-device: improve ncq_prio_enable documentation NCQ priority is an optional feature of the NCQ feature set and should not be confused with the NCQ feature set itself. Clarify the description of the ncq_prio_enable attribute to avoid this confusion. Also add the missing documentation for the equivalent sas_ncq_prio_enable attribute. Signed-off-by: Niklas Cassel Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-11-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-device | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device index aa0fb500e3c9..cf1013df4b92 100644 --- a/Documentation/ABI/testing/sysfs-block-device +++ b/Documentation/ABI/testing/sysfs-block-device @@ -55,6 +55,20 @@ Date: Oct, 2016 KernelVersion: v4.10 Contact: linux-ide@vger.kernel.org Description: - (RW) Write to the file to turn on or off the SATA ncq (native - command queueing) support. By default this feature is turned - off. + (RW) Write to the file to turn on or off the SATA NCQ (native + command queueing) priority support. By default this feature is + turned off. If the device does not support the SATA NCQ + priority feature, writing "1" to this file results in an error. + + +What: /sys/block/*/device/sas_ncq_prio_enable +Date: Oct, 2016 +KernelVersion: v4.10 +Contact: linux-ide@vger.kernel.org +Description: + (RW) This is the equivalent of the ncq_prio_enable attribute + file for SATA devices connected to a SAS host-bus-adapter + (HBA) implementing support for the SATA NCQ priority feature. + This file does not exist if the HBA driver does not implement + support for the SATA NCQ priority feature, regardless of the + device support for this feature. From f5975d18d46ae8485bb08161086e59360844840b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 16 Aug 2021 10:44:56 +0900 Subject: [PATCH 175/315] docs: sysfs-block-device: document ncq_prio_supported Add documentation for the new device attribute file ncq_prio_supported, and its SAS HBA equivalent sas_ncq_prio_supported. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210816014456.2191776-12-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-device | 25 +++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device index cf1013df4b92..7ac7b19b2f72 100644 --- a/Documentation/ABI/testing/sysfs-block-device +++ b/Documentation/ABI/testing/sysfs-block-device @@ -58,7 +58,8 @@ Description: (RW) Write to the file to turn on or off the SATA NCQ (native command queueing) priority support. By default this feature is turned off. If the device does not support the SATA NCQ - priority feature, writing "1" to this file results in an error. + priority feature, writing "1" to this file results in an error + (see ncq_prio_supported). What: /sys/block/*/device/sas_ncq_prio_enable @@ -71,4 +72,26 @@ Description: (HBA) implementing support for the SATA NCQ priority feature. This file does not exist if the HBA driver does not implement support for the SATA NCQ priority feature, regardless of the + device support for this feature (see sas_ncq_prio_supported). + + +What: /sys/block/*/device/ncq_prio_supported +Date: Aug, 2021 +KernelVersion: v5.15 +Contact: linux-ide@vger.kernel.org +Description: + (RO) Indicates if the device supports the SATA NCQ (native + command queueing) priority feature. + + +What: /sys/block/*/device/sas_ncq_prio_supported +Date: Aug, 2021 +KernelVersion: v5.15 +Contact: linux-ide@vger.kernel.org +Description: + (RO) This is the equivalent of the ncq_prio_supported attribute + file for SATA devices connected to a SAS host-bus-adapter + (HBA) implementing support for the SATA NCQ priority feature. + This file does not exist if the HBA driver does not implement + support for the SATA NCQ priority feature, regardless of the device support for this feature. From a680dd72ec336b81511e3bff48efac6dbfa563e7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:57 +0900 Subject: [PATCH 176/315] block: bfq: fix bfq_set_next_ioprio_data() For a request that has a priority level equal to or larger than IOPRIO_BE_NR, bfq_set_next_ioprio_data() prints a critical warning but defaults to setting the request new_ioprio field to IOPRIO_BE_NR. This is not consistent with the warning and the allowed values for priority levels. Fix this by setting the request new_ioprio field to IOPRIO_BE_NR - 1, the lowest priority level allowed. Cc: Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210811033702.368488-2-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e4a61eda2d0f..e546a5f4bff9 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5296,7 +5296,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) if (bfqq->new_ioprio >= IOPRIO_BE_NR) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_BE_NR - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); From 25bca50e523cbe96c0207fbb92f22ff2bc28e9aa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:58 +0900 Subject: [PATCH 177/315] block: improve ioprio class description comment In include/usapi/linux/ioprio.h, change the ioprio class enum comment to remove the outdated reference to CFQ and mention BFQ and mq-deadline instead. Also document the high priority NCQ command use for RT class IOs directed at ATA drives that support NCQ priority. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-3-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 77b17e08b0da..6b735854aebd 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -13,10 +13,12 @@ #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) /* - * These are the io priority groups as implemented by CFQ. RT is the realtime - * class, it always gets premium service. BE is the best-effort scheduling - * class, the default for any process. IDLE is the idle scheduling class, it - * is only served when no one else is using the disk. + * These are the io priority groups as implemented by the BFQ and mq-deadline + * schedulers. RT is the realtime class, it always gets premium service. For + * ATA disks supporting NCQ IO priority, RT class IOs will be processed using + * high priority NCQ commands. BE is the best-effort scheduling class, the + * default for any process. IDLE is the idle scheduling class, it is only + * served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE, From a553a835ca57668b0d9907d8ec2507ec51292d9a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:36:59 +0900 Subject: [PATCH 178/315] block: change ioprio_valid() to an inline function Change the ioprio_valid() macro in include/usapi/linux/ioprio.h to an inline function declared on the kernel side in include/linux/ioprio.h. Also improve checks on the class value by checking the upper bound value. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-4-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 10 ++++++++++ include/uapi/linux/ioprio.h | 2 -- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index ef9ad4fb245f..2ee3373684b1 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,16 @@ #include +/* + * Check that a priority value has a valid class. + */ +static inline bool ioprio_valid(unsigned short ioprio) +{ + unsigned short class = IOPRIO_PRIO_CLASS(ioprio); + + return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE; +} + /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 6b735854aebd..5064e230374c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -27,8 +27,6 @@ enum { IOPRIO_CLASS_IDLE, }; -#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) - /* * 8 best effort priority levels are supported */ From ba05200fcce0a73fa8db16c514fbaa476d1d9399 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:00 +0900 Subject: [PATCH 179/315] block: fix IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_VALUE() macros The ki_ioprio field of struct kiocb is 16-bits (u16) but often handled as an int in the block layer. E.g. ioprio_check_cap() takes an int as argument. With such implicit int casting function calls, the upper 16-bits of the int argument may be left uninitialized by the compiler, resulting in invalid values for the IOPRIO_PRIO_CLASS() macro (garbage upper bits) and in an error return for functions such as ioprio_check_cap(). Fix this by masking the result of the shift by IOPRIO_CLASS_SHIFT bits in the IOPRIO_PRIO_CLASS() macro. The new macro IOPRIO_CLASS_MASK defines the 3-bits mask for the priority class. Similarly, apply the IOPRIO_PRIO_MASK mask to the data argument of the IOPRIO_PRIO_VALUE() macro to ignore the upper bits of the data value. The IOPRIO_CLASS_MASK mask is also applied to the class argument of this macro before shifting the result by IOPRIO_CLASS_SHIFT bits. While at it, also change the argument name of the IOPRIO_PRIO_CLASS() and IOPRIO_PRIO_DATA() macros from "mask" to "ioprio" to reflect the fact that a priority value should be passed rather than a mask. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-5-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- include/uapi/linux/ioprio.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 5064e230374c..936f0d8f30e1 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -5,12 +5,16 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_CLASS_SHIFT 13 +#define IOPRIO_CLASS_MASK 0x07 #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) -#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) +#define IOPRIO_PRIO_CLASS(ioprio) \ + (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) +#define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) +#define IOPRIO_PRIO_VALUE(class, data) \ + ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ + ((data) & IOPRIO_PRIO_MASK)) /* * These are the io priority groups as implemented by the BFQ and mq-deadline From 202bc942c5cd4340d37b06c4e0b8b03f9925d818 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:01 +0900 Subject: [PATCH 180/315] block: Introduce IOPRIO_NR_LEVELS The BFQ scheduler and ioprio_check_cap() both assume that the RT priority class (IOPRIO_CLASS_RT) can have up to 8 different priority levels, similarly to the BE class (IOPRIO_CLASS_iBE). This is controlled using the IOPRIO_BE_NR macro , which is badly named as the number of levels also applies to the RT class. Introduce the class independent IOPRIO_NR_LEVELS macro, defined to 8, to make things clear. Keep the old IOPRIO_BE_NR macro definition as an alias for IOPRIO_NR_LEVELS. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-6-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- block/bfq-iosched.h | 4 ++-- block/bfq-wf2q.c | 6 +++--- block/ioprio.c | 3 +-- fs/f2fs/sysfs.c | 2 +- include/uapi/linux/ioprio.h | 5 +++-- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e546a5f4bff9..4b434369e411 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2508,7 +2508,7 @@ void bfq_end_wr_async_queues(struct bfq_data *bfqd, int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) if (bfqg->async_bfqq[i][j]) bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); if (bfqg->async_idle_bfqq) @@ -5293,10 +5293,10 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR - 1; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); @@ -6825,7 +6825,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) int i, j; for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 99c2a3cb081e..385e28a843d1 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { void *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct bfq_entity *my_entity; @@ -948,7 +948,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; struct rb_root rq_pos_tree; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 7a462df71f68..b74cc0da118e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -505,7 +505,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -514,12 +514,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. */ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF - weight); } static void bfq_get_entity(struct bfq_entity *entity) diff --git a/block/ioprio.c b/block/ioprio.c index bee628f9f1b2..ca6b136c5586 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -74,9 +74,8 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 936f0d8f30e1..aac39338d02c 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -32,9 +32,10 @@ enum { }; /* - * 8 best effort priority levels are supported + * The RT and BE priority classes both support up to 8 priority levels. */ -#define IOPRIO_BE_NR (8) +#define IOPRIO_NR_LEVELS 8 +#define IOPRIO_BE_NR IOPRIO_NR_LEVELS enum { IOPRIO_WHO_PROCESS = 1, From e70344c05995a190a56bbd1a23dc2218bcc8c924 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Aug 2021 12:37:02 +0900 Subject: [PATCH 181/315] block: fix default IO priority handling The default IO priority is the best effort (BE) class with the normal priority level IOPRIO_NORM (4). However, get_task_ioprio() returns IOPRIO_CLASS_NONE/IOPRIO_NORM as the default priority and get_current_ioprio() returns IOPRIO_CLASS_NONE/0. Let's be consistent with the defined default and have both of these functions return the default priority IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) when the user did not define another default IO priority for the task. In include/uapi/linux/ioprio.h, introduce the IOPRIO_BE_NORM macro as an alias to IOPRIO_NORM to clarify that this default level applies to the BE priotity class. In include/linux/ioprio.h, define the macro IOPRIO_DEFAULT as IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) and use this new macro when setting a priority to the default. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210811033702.368488-7-damien.lemoal@wdc.com [axboe: drop unnecessary lightnvm change] Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/ioprio.c | 6 +++--- include/linux/ioprio.h | 7 ++++++- include/uapi/linux/ioprio.h | 5 +++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4b434369e411..e92bc0348433 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5411,7 +5411,7 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, case IOPRIO_CLASS_RT: return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; + ioprio = IOPRIO_BE_NORM; fallthrough; case IOPRIO_CLASS_BE: return &bfqg->async_bfqq[1][ioprio]; diff --git a/block/ioprio.c b/block/ioprio.c index ca6b136c5586..0e4ff245f2bf 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -170,7 +170,7 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + ret = IOPRIO_DEFAULT; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; @@ -182,9 +182,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aprio = IOPRIO_DEFAULT; if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + bprio = IOPRIO_DEFAULT; return min(aprio, bprio); } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 2ee3373684b1..3f53bc27a19b 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -8,6 +8,11 @@ #include +/* + * Default IO priority. + */ +#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) + /* * Check that a priority value has a valid class. */ @@ -51,7 +56,7 @@ static inline int get_current_ioprio(void) if (ioc) return ioc->ioprio; - return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + return IOPRIO_DEFAULT; } /* diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index aac39338d02c..f70f2596a6bf 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -44,8 +44,9 @@ enum { }; /* - * Fallback BE priority + * Fallback BE priority level. */ -#define IOPRIO_NORM (4) +#define IOPRIO_NORM 4 +#define IOPRIO_BE_NORM IOPRIO_NORM #endif /* _UAPI_LINUX_IOPRIO_H */ From 759e0fd4b67766c96b33a114bba0c7d7521fecd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 20 Aug 2021 11:49:29 +0200 Subject: [PATCH 182/315] block: add back the bd_holder_dir reference in bd_link_disk_holder This essentially reverts "block: remove the extra kobject reference in bd_link_disk_holder". That commit dropped the extra reference because the condition in the comment can't be true. But it turns out that comment did not actually describe the problematic situation, so add back the extra reference and document it properly. Fixes: fbd9a39542ec ("block: remove the extra kobject reference in bd_link_disk_holder") Reported-by: Tushar Sugandhi Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/holder.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/holder.c b/block/holder.c index 4568cc4f6827..9dc084182337 100644 --- a/block/holder.c +++ b/block/holder.c @@ -106,6 +106,12 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } list_add(&holder->list, &disk->slave_bdevs); + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we need + * to keep our own here to allow for cleanup past that point. + */ + kobject_get(bdev->bd_holder_dir); + out_unlock: mutex_unlock(&disk->open_mutex); return ret; @@ -138,6 +144,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { if (disk->slave_dir) __unlink_disk_holder(bdev, disk); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } From f196ae282070d798c9144771db65577910d58566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?= Date: Fri, 16 Jul 2021 01:36:45 +0800 Subject: [PATCH 183/315] dt-bindings: timer: Add ABIs for new Ingenic SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.Add OST_CLK_EVENT_TIMER for new XBurst®1 SoCs. 2.Add OST_CLK_EVENT_TIMER0 to OST_CLK_EVENT_TIMER15 for new XBurst®2 SoCs. Signed-off-by: 周琰杰 (Zhou Yanjie) Acked-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1626370605-120775-1-git-send-email-zhouyanjie@wanyeetech.com --- include/dt-bindings/clock/ingenic,sysost.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/dt-bindings/clock/ingenic,sysost.h b/include/dt-bindings/clock/ingenic,sysost.h index 063791b01ab3..d7aa42c08ded 100644 --- a/include/dt-bindings/clock/ingenic,sysost.h +++ b/include/dt-bindings/clock/ingenic,sysost.h @@ -13,4 +13,23 @@ #define OST_CLK_PERCPU_TIMER2 3 #define OST_CLK_PERCPU_TIMER3 4 +#define OST_CLK_EVENT_TIMER 1 + +#define OST_CLK_EVENT_TIMER0 0 +#define OST_CLK_EVENT_TIMER1 1 +#define OST_CLK_EVENT_TIMER2 2 +#define OST_CLK_EVENT_TIMER3 3 +#define OST_CLK_EVENT_TIMER4 4 +#define OST_CLK_EVENT_TIMER5 5 +#define OST_CLK_EVENT_TIMER6 6 +#define OST_CLK_EVENT_TIMER7 7 +#define OST_CLK_EVENT_TIMER8 8 +#define OST_CLK_EVENT_TIMER9 9 +#define OST_CLK_EVENT_TIMER10 10 +#define OST_CLK_EVENT_TIMER11 11 +#define OST_CLK_EVENT_TIMER12 12 +#define OST_CLK_EVENT_TIMER13 13 +#define OST_CLK_EVENT_TIMER14 14 +#define OST_CLK_EVENT_TIMER15 15 + #endif /* __DT_BINDINGS_CLOCK_INGENIC_OST_H__ */ From 5f432cceb3e9de5223fa50d882c4a43cab39a3ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:02 +0200 Subject: [PATCH 184/315] nvme: use blk_mq_alloc_disk Switch to use the blk_mq_alloc_disk helper for allocating the request_queue and gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20210816131910.615153-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9e51266c9e2c..68acd33c3856 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3728,9 +3728,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!ns) goto out_free_id; - ns->queue = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ns->queue)) + disk = blk_mq_alloc_disk(ctrl->tagset, ns); + if (IS_ERR(disk)) goto out_free_ns; + disk->fops = &nvme_bdev_ops; + disk->private_data = ns; + + ns->disk = disk; + ns->queue = disk->queue; if (ctrl->opts && ctrl->opts->data_digest) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue); @@ -3739,20 +3744,12 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); - ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) - goto out_free_queue; + goto out_cleanup_disk; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_unlink_ns; - - disk->fops = &nvme_bdev_ops; - disk->private_data = ns; - disk->queue = ns->queue; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the @@ -3761,15 +3758,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - ns->disk = disk; if (nvme_update_ns_info(ns, id)) - goto out_put_disk; + goto out_unlink_ns; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk->disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_put_disk; + goto out_unlink_ns; } } @@ -3788,10 +3784,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, kfree(id); return; - out_put_disk: - /* prevent double queue cleanup */ - ns->disk->queue = NULL; - put_disk(ns->disk); + out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3799,8 +3792,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, list_del_init(&ns->head->entry); mutex_unlock(&ctrl->subsys->lock); nvme_put_ns_head(ns->head); - out_free_queue: - blk_cleanup_queue(ns->queue); + out_cleanup_disk: + blk_cleanup_disk(disk); out_free_ns: kfree(ns); out_free_id: From 45938335d0a9773d65a82a7ca722bb76e4b997a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:03 +0200 Subject: [PATCH 185/315] st: do not allocate a gendisk st is a character driver and thus does not need to allocate a gendisk, which is only used for file system-like block layer I/O on block devices. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/st.c | 49 ++++++++++++----------------------------------- drivers/scsi/st.h | 2 +- 2 files changed, 13 insertions(+), 38 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index c6f14540ae03..d1abc020f3c0 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -309,13 +309,8 @@ static char * st_incompatible(struct scsi_device* SDp) } -static inline char *tape_name(struct scsi_tape *tape) -{ - return tape->disk->disk_name; -} - #define st_printk(prefix, t, fmt, a...) \ - sdev_prefix_printk(prefix, (t)->device, tape_name(t), fmt, ##a) + sdev_prefix_printk(prefix, (t)->device, (t)->name, fmt, ##a) #ifdef DEBUG #define DEBC_printk(t, fmt, a...) \ if (debugging) { st_printk(ST_DEB_MSG, t, fmt, ##a ); } @@ -363,7 +358,7 @@ static int st_chk_result(struct scsi_tape *STp, struct st_request * SRpnt) int result = SRpnt->result; u8 scode; DEB(const char *stp;) - char *name = tape_name(STp); + char *name = STp->name; struct st_cmdstatus *cmdstatp; if (!result) @@ -3841,8 +3836,9 @@ static long st_ioctl_common(struct file *file, unsigned int cmd_in, void __user !capable(CAP_SYS_RAWIO)) i = -EPERM; else - i = scsi_cmd_ioctl(STp->disk->queue, STp->disk, - file->f_mode, cmd_in, p); + i = scsi_cmd_ioctl(STp->device->request_queue, + NULL, file->f_mode, cmd_in, + p); if (i != -ENOTTY) return i; break; @@ -4216,7 +4212,7 @@ static int create_one_cdev(struct scsi_tape *tape, int mode, int rew) i = mode << (4 - ST_NBR_MODE_BITS); snprintf(name, 10, "%s%s%s", rew ? "n" : "", - tape->disk->disk_name, st_formats[i]); + tape->name, st_formats[i]); dev = device_create(&st_sysfs_class, &tape->device->sdev_gendev, cdev_devno, &tape->modes[mode], "%s", name); @@ -4271,7 +4267,6 @@ static void remove_cdevs(struct scsi_tape *tape) static int st_probe(struct device *dev) { struct scsi_device *SDp = to_scsi_device(dev); - struct gendisk *disk = NULL; struct scsi_tape *tpnt = NULL; struct st_modedef *STm; struct st_partstat *STps; @@ -4301,27 +4296,13 @@ static int st_probe(struct device *dev) goto out; } - disk = alloc_disk(1); - if (!disk) { - sdev_printk(KERN_ERR, SDp, - "st: out of memory. Device not attached.\n"); - goto out_buffer_free; - } - tpnt = kzalloc(sizeof(struct scsi_tape), GFP_KERNEL); if (tpnt == NULL) { sdev_printk(KERN_ERR, SDp, "st: Can't allocate device descriptor.\n"); - goto out_put_disk; + goto out_buffer_free; } kref_init(&tpnt->kref); - tpnt->disk = disk; - disk->private_data = &tpnt->driver; - /* SCSI tape doesn't register this gendisk via add_disk(). Manually - * take queue reference that release_disk() expects. */ - if (!blk_get_queue(SDp->request_queue)) - goto out_put_disk; - disk->queue = SDp->request_queue; tpnt->driver = &st_template; tpnt->device = SDp; @@ -4394,10 +4375,10 @@ static int st_probe(struct device *dev) idr_preload_end(); if (error < 0) { pr_warn("st: idr allocation failed: %d\n", error); - goto out_put_queue; + goto out_free_tape; } tpnt->index = error; - sprintf(disk->disk_name, "st%d", tpnt->index); + sprintf(tpnt->name, "st%d", tpnt->index); tpnt->stats = kzalloc(sizeof(struct scsi_tape_stats), GFP_KERNEL); if (tpnt->stats == NULL) { sdev_printk(KERN_ERR, SDp, @@ -4414,9 +4395,9 @@ static int st_probe(struct device *dev) scsi_autopm_put_device(SDp); sdev_printk(KERN_NOTICE, SDp, - "Attached scsi tape %s\n", tape_name(tpnt)); + "Attached scsi tape %s\n", tpnt->name); sdev_printk(KERN_INFO, SDp, "%s: try direct i/o: %s (alignment %d B)\n", - tape_name(tpnt), tpnt->try_dio ? "yes" : "no", + tpnt->name, tpnt->try_dio ? "yes" : "no", queue_dma_alignment(SDp->request_queue) + 1); return 0; @@ -4428,10 +4409,7 @@ out_idr_remove: spin_lock(&st_index_lock); idr_remove(&st_index_idr, tpnt->index); spin_unlock(&st_index_lock); -out_put_queue: - blk_put_queue(disk->queue); -out_put_disk: - put_disk(disk); +out_free_tape: kfree(tpnt); out_buffer_free: kfree(buffer); @@ -4470,7 +4448,6 @@ static int st_remove(struct device *dev) static void scsi_tape_release(struct kref *kref) { struct scsi_tape *tpnt = to_scsi_tape(kref); - struct gendisk *disk = tpnt->disk; tpnt->device = NULL; @@ -4480,8 +4457,6 @@ static void scsi_tape_release(struct kref *kref) kfree(tpnt->buffer); } - disk->private_data = NULL; - put_disk(disk); kfree(tpnt->stats); kfree(tpnt); return; diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index 9d3c38bb0794..c0ef0d9aaf8a 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -187,7 +187,7 @@ struct scsi_tape { unsigned char last_cmnd[6]; unsigned char last_sense[16]; #endif - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct kref kref; struct scsi_tape_stats *stats; }; From aebbb5831fbd5352fd9bd2c858bc249026d3c652 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:04 +0200 Subject: [PATCH 186/315] sg: do not allocate a gendisk sg is a character driver and thus does not need to allocate a gendisk, which is only used for file system-like block layer I/O on block devices. Signed-off-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/20210816131910.615153-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 91e2221bbb0d..477267add49d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -166,7 +166,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ bool exclude; /* 1->open(O_EXCL) succeeded and is active */ int open_cnt; /* count of opens (perhaps < num(sfds) ) */ char sgdebug; /* 0->off, 1->sense, 9->dump dev, 10-> all devs */ - struct gendisk *disk; + char name[DISK_NAME_LEN]; struct cdev * cdev; /* char_dev [sysfs: /sys/cdev/major/sg] */ struct kref d_ref; } Sg_device; @@ -202,8 +202,7 @@ static void sg_device_destroy(struct kref *kref); #define SZ_SG_REQ_INFO sizeof(sg_req_info_t) #define sg_printk(prefix, sdp, fmt, a...) \ - sdev_prefix_printk(prefix, (sdp)->device, \ - (sdp)->disk->disk_name, fmt, ##a) + sdev_prefix_printk(prefix, (sdp)->device, (sdp)->name, fmt, ##a) /* * The SCSI interfaces that use read() and write() as an asynchronous variant of @@ -832,7 +831,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, srp->rq->timeout = timeout; kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */ - blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io); + blk_execute_rq_nowait(NULL, srp->rq, at_head, sg_rq_end_io); return 0; } @@ -1119,8 +1118,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: - return blk_trace_setup(sdp->device->request_queue, - sdp->disk->disk_name, + return blk_trace_setup(sdp->device->request_queue, NULL, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: @@ -1456,7 +1454,7 @@ static struct class *sg_sysfs_class; static int sg_sysfs_valid = 0; static Sg_device * -sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) +sg_alloc(struct scsi_device *scsidp) { struct request_queue *q = scsidp->request_queue; Sg_device *sdp; @@ -1492,9 +1490,7 @@ sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) SCSI_LOG_TIMEOUT(3, sdev_printk(KERN_INFO, scsidp, "sg_alloc: dev=%d \n", k)); - sprintf(disk->disk_name, "sg%d", k); - disk->first_minor = k; - sdp->disk = disk; + sprintf(sdp->name, "sg%d", k); sdp->device = scsidp; mutex_init(&sdp->open_rel_lock); INIT_LIST_HEAD(&sdp->sfds); @@ -1521,19 +1517,11 @@ static int sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) { struct scsi_device *scsidp = to_scsi_device(cl_dev->parent); - struct gendisk *disk; Sg_device *sdp = NULL; struct cdev * cdev = NULL; int error; unsigned long iflags; - disk = alloc_disk(1); - if (!disk) { - pr_warn("%s: alloc_disk failed\n", __func__); - return -ENOMEM; - } - disk->major = SCSI_GENERIC_MAJOR; - error = -ENOMEM; cdev = cdev_alloc(); if (!cdev) { @@ -1543,7 +1531,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) cdev->owner = THIS_MODULE; cdev->ops = &sg_fops; - sdp = sg_alloc(disk, scsidp); + sdp = sg_alloc(scsidp); if (IS_ERR(sdp)) { pr_warn("%s: sg_alloc failed\n", __func__); error = PTR_ERR(sdp); @@ -1561,7 +1549,7 @@ sg_add_device(struct device *cl_dev, struct class_interface *cl_intf) sg_class_member = device_create(sg_sysfs_class, cl_dev->parent, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), - sdp, "%s", disk->disk_name); + sdp, "%s", sdp->name); if (IS_ERR(sg_class_member)) { pr_err("%s: device_create failed\n", __func__); error = PTR_ERR(sg_class_member); @@ -1589,7 +1577,6 @@ cdev_add_err: kfree(sdp); out: - put_disk(disk); if (cdev) cdev_del(cdev); return error; @@ -1613,7 +1600,6 @@ sg_device_destroy(struct kref *kref) SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp, "sg_device_destroy\n")); - put_disk(sdp->disk); kfree(sdp); } @@ -2606,7 +2592,7 @@ static int sg_proc_seq_show_debug(struct seq_file *s, void *v) goto skip; read_lock(&sdp->sfd_lock); if (!list_empty(&sdp->sfds)) { - seq_printf(s, " >>> device=%s ", sdp->disk->disk_name); + seq_printf(s, " >>> device=%s ", sdp->name); if (atomic_read(&sdp->detaching)) seq_puts(s, "detaching pending close "); else if (sdp->device) { From 4dcc4874deb41a11ece9c6e8858385235463c1ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:05 +0200 Subject: [PATCH 187/315] block: cleanup the lockdep handling in *alloc_disk Pass the lockdep name to the low-level __blk_alloc_disk helper and hardcode the name for it given that the number of minors or node_id are not very useful information. While this passes a pointless argument for non-lockdep builds that is not really an issue as disk allocation is a probe time only slow path. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- block/genhd.c | 8 +++++--- include/linux/blk-mq.h | 10 +++------- include/linux/genhd.h | 23 ++++++----------------- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d2725f94491d..4c56e43e6992 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3133,7 +3133,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -3142,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node); + disk = __alloc_disk_node(0, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c index 731a46063132..2ad2b25dfc87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(int minors, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1282,6 +1283,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); #endif @@ -1298,7 +1300,7 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -struct gendisk *__blk_alloc_disk(int node) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; @@ -1307,7 +1309,7 @@ struct gendisk *__blk_alloc_disk(int node) if (!q) return NULL; - disk = __alloc_disk_node(0, node); + disk = __alloc_disk_node(0, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 22215db36122..13ba1861e688 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -432,18 +432,14 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ ({ \ static struct lock_class_key __key; \ - struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ \ - if (!IS_ERR(__disk)) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_mq_alloc_disk(set, queuedata, &__key); \ }) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, - void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b47e297cd551..3d2e5ee30677 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,27 +259,21 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -extern struct gendisk *__alloc_disk_node(int minors, int node_id); +struct gendisk *__alloc_disk_node(int minors, int node_id, + struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); #define alloc_disk_node(minors, node_id) \ ({ \ static struct lock_class_key __key; \ - const char *__name; \ - struct gendisk *__disk; \ \ - __name = "(gendisk_completion)"#minors"("#node_id")"; \ - \ - __disk = __alloc_disk_node(minors, node_id); \ - \ - if (__disk) \ - lockdep_init_map(&__disk->lockdep_map, __name, &__key, 0); \ - \ - __disk; \ + __alloc_disk_node(minors, node_id, &__key); \ }) #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + /** * blk_alloc_disk - allocate a gendisk structure * @node_id: numa node to allocate on @@ -291,15 +285,10 @@ extern void put_disk(struct gendisk *disk); */ #define blk_alloc_disk(node_id) \ ({ \ - struct gendisk *__disk = __blk_alloc_disk(node_id); \ static struct lock_class_key __key; \ \ - if (__disk) \ - lockdep_init_map(&__disk->lockdep_map, \ - "(bio completion)", &__key, 0); \ - __disk; \ + __blk_alloc_disk(node_id, &__key); \ }) -struct gendisk *__blk_alloc_disk(int node); void blk_cleanup_disk(struct gendisk *disk); int __register_blkdev(unsigned int major, const char *name, From 9c2b9dbafc067e173db30c4fd0636392d27944e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:06 +0200 Subject: [PATCH 188/315] block: remove alloc_disk and alloc_disk_node Most drivers should use and have been converted to use blk_alloc_disk and blk_mq_alloc_disk. Only the scsi ULPs and dasd still allocate a disk separately from the request_queue, so don't bother with convenience macros for something that should not see significant new users and remove these wrappers. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_genhd.c | 5 ++++- drivers/scsi/sd.c | 3 ++- drivers/scsi/sr.c | 4 +++- include/linux/genhd.h | 10 ---------- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 493e8469893c..07a69b19dd31 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -24,6 +24,8 @@ #include "dasd_int.h" +static struct lock_class_key dasd_bio_compl_lkclass; + /* * Allocate and register gendisk structure for device. */ @@ -38,7 +40,8 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = alloc_disk(1 << DASD_PARTN_BITS); + gdp = __alloc_disk_node(1 << DASD_PARTN_BITS, NUMA_NO_NODE, + &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b5b8266e142..a9535c6484de 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -129,6 +129,7 @@ static DEFINE_MUTEX(sd_ref_mutex); static struct kmem_cache *sd_cdb_cache; static mempool_t *sd_cdb_pool; static mempool_t *sd_page_pool; +static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", @@ -3408,7 +3409,7 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = alloc_disk(SD_MINORS); + gd = __alloc_disk_node(SD_MINORS, NUMA_NO_NODE, &sd_bio_compl_lkclass); if (!gd) goto out_free; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 94c254e9012e..fee2bdfe6132 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -106,6 +106,8 @@ static struct scsi_driver sr_template = { static unsigned long sr_index_bits[SR_DISKS / BITS_PER_LONG]; static DEFINE_SPINLOCK(sr_index_lock); +static struct lock_class_key sr_bio_compl_lkclass; + /* This semaphore is used to mediate the 0->1 reference get in the * face of object destruction (i.e. we can't allow a get on an * object after last put) */ @@ -712,7 +714,7 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = alloc_disk(1); + disk = __alloc_disk_node(1, NUMA_NO_NODE, &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3d2e5ee30677..ceda9b255dba 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -262,16 +262,6 @@ void blk_drop_partitions(struct gendisk *disk); struct gendisk *__alloc_disk_node(int minors, int node_id, struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); - -#define alloc_disk_node(minors, node_id) \ -({ \ - static struct lock_class_key __key; \ - \ - __alloc_disk_node(minors, node_id, &__key); \ -}) - -#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) - struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); /** From a58bd7683fcb60ae24c8572f932b48bc65719b7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:07 +0200 Subject: [PATCH 189/315] block: remove the minors argument to __alloc_disk_node This was a leftover from the legacy alloc_disk interface. Switch the scsi ULPs and dasd to set ->minors directly like all other drivers and remove the argument. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland [dasd] Link: https://lore.kernel.org/r/20210816131910.615153-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/genhd.c | 6 ++---- drivers/s390/block/dasd_genhd.c | 4 ++-- drivers/scsi/sd.c | 3 ++- drivers/scsi/sr.c | 3 ++- include/linux/genhd.h | 3 +-- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c56e43e6992..8ac30c343c06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,7 +3143,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(0, set->numa_node, lkclass); + disk = __alloc_disk_node(set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); diff --git a/block/genhd.c b/block/genhd.c index 2ad2b25dfc87..caeda726189c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,8 +1254,7 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id, - struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1277,7 +1276,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - disk->minors = minors; rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1309,7 +1307,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(0, node, lkclass); + disk = __alloc_disk_node(node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 07a69b19dd31..6e44515b4d33 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -40,14 +40,14 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = __alloc_disk_node(1 << DASD_PARTN_BITS, NUMA_NO_NODE, - &dasd_bio_compl_lkclass); + gdp = __alloc_disk_node(NUMA_NO_NODE, &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; gdp->first_minor = base->devindex << DASD_PARTN_BITS; + gdp->minors = 1 << DASD_PARTN_BITS; gdp->fops = &dasd_device_operations; /* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a9535c6484de..1c6b8f012219 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3409,7 +3409,7 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = __alloc_disk_node(SD_MINORS, NUMA_NO_NODE, &sd_bio_compl_lkclass); + gd = __alloc_disk_node(NUMA_NO_NODE, &sd_bio_compl_lkclass); if (!gd) goto out_free; @@ -3455,6 +3455,7 @@ static int sd_probe(struct device *dev) gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); + gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = &sdkp->driver; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index fee2bdfe6132..2c45b4140e67 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -714,7 +714,7 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = __alloc_disk_node(1, NUMA_NO_NODE, &sr_bio_compl_lkclass); + disk = __alloc_disk_node(NUMA_NO_NODE, &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); @@ -731,6 +731,7 @@ static int sr_probe(struct device *dev) disk->major = SCSI_CDROM_MAJOR; disk->first_minor = minor; + disk->minors = 1; sprintf(disk->disk_name, "sr%d", minor); disk->fops = &sr_bdops; disk->flags = GENHD_FL_CD | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ceda9b255dba..d20f101be758 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,8 +259,7 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -struct gendisk *__alloc_disk_node(int minors, int node_id, - struct lock_class_key *lkclass); +struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); From 4a1fa41d304c7129328d4d5c7f31715b95e23b29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:08 +0200 Subject: [PATCH 190/315] block: pass a request_queue to __blk_alloc_disk Pass in a request_queue and assign disk->queue in __blk_alloc_disk to ensure struct gendisk always has a valid ->queue pointer. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- block/genhd.c | 7 ++++--- drivers/s390/block/dasd_genhd.c | 4 ++-- drivers/scsi/sd.c | 4 ++-- drivers/scsi/sr.c | 4 ++-- include/linux/genhd.h | 3 ++- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac30c343c06..2ca7e7c94b18 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,12 +3143,11 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, if (IS_ERR(q)) return ERR_CAST(q); - disk = __alloc_disk_node(set->numa_node, lkclass); + disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_cleanup_queue(q); return ERR_PTR(-ENOMEM); } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); diff --git a/block/genhd.c b/block/genhd.c index caeda726189c..f18122ee2778 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1254,7 +1254,8 @@ dev_t blk_lookup_devt(const char *name, int partno) return devt; } -struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; @@ -1281,6 +1282,7 @@ struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass) disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); + disk->queue = q; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); @@ -1307,12 +1309,11 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) if (!q) return NULL; - disk = __alloc_disk_node(node, lkclass); + disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_cleanup_queue(q); return NULL; } - disk->queue = q; return disk; } EXPORT_SYMBOL(__blk_alloc_disk); diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 6e44515b4d33..fa966e0db6ca 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -40,7 +40,8 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (base->devindex >= DASD_PER_MAJOR) return -EBUSY; - gdp = __alloc_disk_node(NUMA_NO_NODE, &dasd_bio_compl_lkclass); + gdp = __alloc_disk_node(block->request_queue, NUMA_NO_NODE, + &dasd_bio_compl_lkclass); if (!gdp) return -ENOMEM; @@ -76,7 +77,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) set_disk_ro(gdp, 1); dasd_add_link_to_gendisk(gdp, base); - gdp->queue = block->request_queue; block->gdp = gdp; set_capacity(block->gdp, 0); device_add_disk(&base->cdev->dev, block->gdp, NULL); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1c6b8f012219..610ebba0d66e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3409,7 +3409,8 @@ static int sd_probe(struct device *dev) if (!sdkp) goto out; - gd = __alloc_disk_node(NUMA_NO_NODE, &sd_bio_compl_lkclass); + gd = __alloc_disk_node(sdp->request_queue, NUMA_NO_NODE, + &sd_bio_compl_lkclass); if (!gd) goto out_free; @@ -3459,7 +3460,6 @@ static int sd_probe(struct device *dev) gd->fops = &sd_fops; gd->private_data = &sdkp->driver; - gd->queue = sdkp->device->request_queue; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 2c45b4140e67..a0df27db4d61 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -714,7 +714,8 @@ static int sr_probe(struct device *dev) kref_init(&cd->kref); - disk = __alloc_disk_node(NUMA_NO_NODE, &sr_bio_compl_lkclass); + disk = __alloc_disk_node(sdev->request_queue, NUMA_NO_NODE, + &sr_bio_compl_lkclass); if (!disk) goto fail_free; mutex_init(&cd->lock); @@ -765,7 +766,6 @@ static int sr_probe(struct device *dev) set_capacity(disk, cd->capacity); disk->private_data = &cd->driver; - disk->queue = sdev->request_queue; if (register_cdrom(disk, &cd->cdi)) goto fail_minor; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index d20f101be758..13e90e6231d8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -259,7 +259,8 @@ static inline sector_t get_capacity(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); -struct gendisk *__alloc_disk_node(int node_id, struct lock_class_key *lkclass); +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); extern void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); From 61a35cfc26334fe1c8e970ca8fafeae2daae257d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:19:09 +0200 Subject: [PATCH 191/315] block: hold a request_queue reference for the lifetime of struct gendisk Acquire the queue ref dropped in disk_release in __blk_alloc_disk so any allocate gendisk always has a queue reference. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210816131910.615153-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 19 +++++++------------ include/linux/genhd.h | 1 - 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index f18122ee2778..6294517cebe6 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -551,15 +551,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, register_disk(parent, disk, groups); blk_register_queue(disk); - /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. - */ - if (blk_get_queue(disk->queue)) - set_bit(GD_QUEUE_REF, &disk->state); - else - WARN_ON_ONCE(1); - disk_add_events(disk); blk_integrity_add(disk); } @@ -1087,8 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); - if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) - blk_put_queue(disk->queue); + blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1259,9 +1249,12 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, { struct gendisk *disk; + if (!blk_get_queue(q)) + return NULL; + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) - return NULL; + goto out_put_queue; disk->bdi = bdi_alloc(node_id); if (!disk->bdi) @@ -1296,6 +1289,8 @@ out_free_bdi: bdi_put(disk->bdi); out_free_disk: kfree(disk); +out_put_queue: + blk_put_queue(q); return NULL; } EXPORT_SYMBOL(__alloc_disk_node); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 13e90e6231d8..55acefdd8a20 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -149,7 +149,6 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 -#define GD_QUEUE_REF 2 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ From d152c682f03ceb65c0d9663d4ba6ee2d46aa784d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Aug 2021 15:46:24 +0200 Subject: [PATCH 192/315] block: add an explicit ->disk backpointer to the request_queue Replace the magic lookup through the kobject tree with an explicit backpointer, given that the device model links are set up and torn down at times when I/O is still possible, leading to potential NULL or invalid pointer dereferences. Fixes: edb0872f44ec ("block: move the bdi from the request_queue to the gendisk") Reported-by: syzbot Signed-off-by: Christoph Hellwig Tested-by: Sven Schnelle Link: https://lore.kernel.org/r/20210816134624.GA24234@lst.de Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-cgroup.c | 4 ++-- block/blk-mq.c | 2 +- block/blk-settings.c | 8 ++++---- block/blk-sysfs.c | 13 ++++++------- block/blk-wbt.c | 10 +++++----- block/genhd.c | 2 ++ include/linux/blkdev.h | 5 ++--- include/trace/events/kyber.h | 6 +++--- 9 files changed, 26 insertions(+), 26 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e92bc0348433..480e1a134859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5269,7 +5269,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(queue_to_disk(bfqq->bfqd->queue)->bdi), + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), ioprio_class); fallthrough; case IOPRIO_CLASS_NONE: diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b8ec47dcce42..f575aa42922b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -489,9 +489,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - if (!queue_has_disk(blkg->q) || !queue_to_disk(blkg->q)->bdi->dev) + if (!blkg->q->disk || !blkg->q->disk->bdi->dev) return NULL; - return bdi_dev_name(queue_to_disk(blkg->q)->bdi); + return bdi_dev_name(blkg->q->disk->bdi); } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index 2ca7e7c94b18..0a33d16a7298 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -525,7 +525,7 @@ void blk_mq_free_request(struct request *rq) __blk_mq_dec_active_requests(hctx); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(queue_to_disk(q)->bdi); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); diff --git a/block/blk-settings.c b/block/blk-settings.c index 3613d2cc0688..a7c857ad7d10 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -141,9 +141,9 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -475,9 +475,9 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); - if (!queue_has_disk(q)) + if (!q->disk) return; - queue_to_disk(q)->bdi->ra_pages = + q->disk->bdi->ra_pages = max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 586507a5b8c2..7fd99487300c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -90,9 +90,9 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page) { unsigned long ra_kb; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; - ra_kb = queue_to_disk(q)->bdi->ra_pages << (PAGE_SHIFT - 10); + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); return queue_var_show(ra_kb, page); } @@ -102,12 +102,12 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) unsigned long ra_kb; ssize_t ret; - if (!queue_has_disk(q)) + if (!q->disk) return -EINVAL; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - queue_to_disk(q)->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -254,9 +254,8 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - if (queue_has_disk(q)) - queue_to_disk(q)->bdi->io_pages = - max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 31086afaad9c..874c1c37bf0c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -97,7 +97,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &queue_to_disk(rwb->rqos.q)->bdi->wb; + struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -234,7 +234,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -287,7 +287,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = queue_to_disk(rwb->rqos.q)->bdi; + struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -359,8 +359,8 @@ static void wb_timer_fn(struct blk_stat_callback *cb) status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(queue_to_disk(rwb->rqos.q)->bdi, status, - rqd->scale_step, inflight); + trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, + inflight); /* * If we exceeded the latency target, step down. If we did not, diff --git a/block/genhd.c b/block/genhd.c index 6294517cebe6..02cd9ec93e52 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1078,6 +1078,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); + disk->queue->disk = NULL; blk_put_queue(disk->queue); iput(disk->part0->bd_inode); /* frees the disk */ } @@ -1276,6 +1277,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, device_initialize(disk_to_dev(disk)); inc_diskseq(disk); disk->queue = q; + q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED INIT_LIST_HEAD(&disk->slave_bdevs); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index df404c1fb087..22b5b8502d2a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -421,6 +421,8 @@ struct request_queue { spinlock_t queue_lock; + struct gendisk *disk; + /* * queue kobject */ @@ -661,9 +663,6 @@ extern void blk_clear_pm_only(struct request_queue *q); dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) -#define queue_has_disk(q) ((q)->kobj.parent != NULL) -#define queue_to_disk(q) (dev_to_disk(kobj_to_dev((q)->kobj.parent))) - static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; diff --git a/include/trace/events/kyber.h b/include/trace/events/kyber.h index f9802562edf6..491098a0d8ed 100644 --- a/include/trace/events/kyber.h +++ b/include/trace/events/kyber.h @@ -30,7 +30,7 @@ TRACE_EVENT(kyber_latency, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strlcpy(__entry->type, type, sizeof(__entry->type)); __entry->percentile = percentile; @@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); __entry->depth = depth; ), @@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled, ), TP_fast_assign( - __entry->dev = disk_devt(queue_to_disk(q)); + __entry->dev = disk_devt(q->disk); strlcpy(__entry->domain, domain, sizeof(__entry->domain)); ), From 40b3a52ffc5bc3b5427d5d35b035cfb19d03fdd6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:32 +0200 Subject: [PATCH 193/315] block: add a sanity check for a live disk in del_gendisk Add a sanity check to del_gendisk to do nothing when the disk wasn't successfully added. This papers over the complete lack of add_disk error handling, which is about to get fixed gradually. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 02cd9ec93e52..935f74c652f1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -579,7 +579,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk->queue)) + if (WARN_ON_ONCE(!disk_live(disk))) return; blk_integrity_del(disk); From 52b85909f85d06efa69aaf4210e72467f1f58d2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:33 +0200 Subject: [PATCH 194/315] block: fold register_disk into device_add_disk There is no real reason these should be separate. Also simplify the groups assignment a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 131 +++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 935f74c652f1..ec4be5889fbf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -409,71 +409,6 @@ static void disk_scan_partitions(struct gendisk *disk) blkdev_put(bdev, FMODE_READ); } -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) -{ - struct device *ddev = disk_to_dev(disk); - int err; - - ddev->parent = parent; - - dev_set_name(ddev, "%s", disk->disk_name); - - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); - - if (groups) { - WARN_ON(ddev->groups); - ddev->groups = groups; - } - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } - - /* - * avoid probable deadlock caused by allocating memory with - * GFP_KERNEL in runtime_resume callback of its all ancestor - * devices - */ - pm_runtime_set_memalloc_noio(ddev, true); - - disk->part0->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. - */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } - - if (disk->flags & GENHD_FL_HIDDEN) - return; - - disk_scan_partitions(disk); - - /* announce the disk and partitions after all partitions are created */ - dev_set_uevent_suppress(ddev, 0); - disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - err = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, - "bdi"); - WARN_ON(err); - } -} - /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk @@ -490,6 +425,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { + struct device *ddev = disk_to_dev(disk); int ret; /* @@ -538,17 +474,70 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_NO_PART_SCAN; } else { - struct device *dev = disk_to_dev(disk); - /* Register BDI before referencing it from bdev */ - dev->devt = MKDEV(disk->major, disk->first_minor); + ddev->devt = MKDEV(disk->major, disk->first_minor); ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); WARN_ON(ret); - bdi_set_owner(disk->bdi, dev); - bdev_add(disk->part0, dev->devt); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); } - register_disk(parent, disk, groups); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + ddev->parent = parent; + ddev->groups = groups; + dev_set_name(ddev, "%s", disk->disk_name); + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + ret = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (ret) { + device_del(ddev); + return; + } + } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + /* + * XXX: this is a mess, can't wait for real error handling in add_disk. + * Make sure ->slave_dir is NULL if we failed some of the registration + * so that the cleanup in bd_unlink_disk_holder works properly. + */ + if (bd_register_pending_holders(disk) < 0) { + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; + } + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + disk_scan_partitions(disk); + + /* + * Announce the disk and partitions after all partitions are + * created. + */ + dev_set_uevent_suppress(ddev, 0); + disk_uevent(disk, KOBJ_ADD); + + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + } + blk_register_queue(disk); disk_add_events(disk); From 8235b5c1e8c1c0537f03a21a2e380098bed25248 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:34 +0200 Subject: [PATCH 195/315] block: call bdev_add later in device_add_disk Once bdev_add is called userspace can open the block device. Ensure that the struct device, which is used for refcounting of the disk besides various other things, is fully setup at that point. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ec4be5889fbf..ab455f110be2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,29 +466,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_alloc_events(disk); - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. - */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { - /* Register BDI before referencing it from bdev */ - ddev->devt = MKDEV(disk->major, disk->first_minor); - ret = bdi_register(disk->bdi, "%u:%u", - disk->major, disk->first_minor); - WARN_ON(ret); - bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); - } - /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (!(disk->flags & GENHD_FL_HIDDEN)) + ddev->devt = MKDEV(disk->major, disk->first_minor); if (device_add(ddev)) return; if (!sysfs_deprecated) { @@ -521,12 +506,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } - if (!(disk->flags & GENHD_FL_HIDDEN)) { + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + ret = bdi_register(disk->bdi, "%u:%u", + disk->major, disk->first_minor); + WARN_ON(ret); + bdi_set_owner(disk->bdi, ddev); + bdev_add(disk->part0, ddev->devt); + disk_scan_partitions(disk); /* * Announce the disk and partitions after all partitions are - * created. + * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); From 9d5ee6767c85762205b788ed1245f21fafd6c504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:35 +0200 Subject: [PATCH 196/315] block: create the bdi link earlier in device_add_disk This will simplify error handling going forward. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ab455f110be2..f05e58f214d2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -518,8 +518,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->major, disk->first_minor); WARN_ON(ret); bdi_set_owner(disk->bdi, ddev); - bdev_add(disk->part0, ddev->devt); + if (disk->bdi->dev) { + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + WARN_ON(ret); + } + bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); /* @@ -528,12 +533,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); - - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } } blk_register_queue(disk); From bab53f6b617d9f530978d6e3693f88e586d81a8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:36 +0200 Subject: [PATCH 197/315] block: call blk_integrity_add earlier in device_add_disk Doing all the sysfs file creation before adding the bdev and thus allowing it to be opened will simplify the about to be added error handling. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index f05e58f214d2..75d900e4c82f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -492,6 +492,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); + blk_integrity_add(disk); + disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); @@ -538,7 +540,6 @@ void device_add_disk(struct device *parent, struct gendisk *disk, blk_register_queue(disk); disk_add_events(disk); - blk_integrity_add(disk); } EXPORT_SYMBOL(device_add_disk); From 75f4dca59694dfe288ae6a48d7b147b60d11c95c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Aug 2021 16:45:37 +0200 Subject: [PATCH 198/315] block: call blk_register_queue earlier in device_add_disk Ensure that all the sysfs bits are set up before bdev_add is called, as that will make the upcomding error handling much easier. However this means the call to disk_update_readahead has to be split as that requires a bdi. Also remove various sanity checks that don't make sense now that blk_register_queue only has a single caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210818144542.19305-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 9 --------- block/genhd.c | 5 +++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7fd99487300c..614d9d47de36 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -856,15 +856,6 @@ int blk_register_queue(struct gendisk *disk) struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - if (WARN_ON(!q)) - return -ENXIO; - - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); - - disk_update_readahead(disk); - ret = blk_trace_init_sysfs(dev); if (ret) return ret; diff --git a/block/genhd.c b/block/genhd.c index 75d900e4c82f..a54b4849242c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -508,6 +508,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; } + blk_register_queue(disk); + if (disk->flags & GENHD_FL_HIDDEN) { /* * Don't let hidden disks show up in /proc/partitions, @@ -537,8 +539,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_uevent(disk, KOBJ_ADD); } - blk_register_queue(disk); - + disk_update_readahead(disk); disk_add_events(disk); } EXPORT_SYMBOL(device_add_disk); From 614310c9c8ca15359f4e71a5bbd9165897b4d54e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:38 +0200 Subject: [PATCH 199/315] block: return errors from blk_integrity_add Prepare for proper error handling in add_disk. Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 12 +++++++----- block/blk.h | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 410da060d1f5..69a12177dfb6 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -431,13 +431,15 @@ void blk_integrity_unregister(struct gendisk *disk) } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_add(struct gendisk *disk) +int blk_integrity_add(struct gendisk *disk) { - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity")) - return; + int ret; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity"); + if (!ret) + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); + return ret; } void blk_integrity_del(struct gendisk *disk) diff --git a/block/blk.h b/block/blk.h index 148bdcd3aa08..c9727dd56da7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -132,7 +132,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } -void blk_integrity_add(struct gendisk *); +int blk_integrity_add(struct gendisk *disk); void blk_integrity_del(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, @@ -166,8 +166,9 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline void blk_integrity_add(struct gendisk *disk) +static inline int blk_integrity_add(struct gendisk *disk) { + return 0; } static inline void blk_integrity_del(struct gendisk *disk) { From 92e7755ebc69233e25a2d1b760aeff536dc4016b Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:39 +0200 Subject: [PATCH 200/315] block: return errors from disk_alloc_events Prepare for proper error handling in add_disk. Signed-off-by: Luis Chamberlain [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-9-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/disk-events.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk.h b/block/blk.h index c9727dd56da7..bbbcc1a64a2d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -362,7 +362,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -void disk_alloc_events(struct gendisk *disk); +int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); diff --git a/block/disk-events.c b/block/disk-events.c index 7445b8ff2775..8d5496e7592a 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -444,17 +444,17 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, /* * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. */ -void disk_alloc_events(struct gendisk *disk) +int disk_alloc_events(struct gendisk *disk) { struct disk_events *ev; if (!disk->fops->check_events || !disk->events) - return; + return 0; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; + return -ENOMEM; } INIT_LIST_HEAD(&ev->node); @@ -466,6 +466,7 @@ void disk_alloc_events(struct gendisk *disk) INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); disk->ev = ev; + return 0; } void disk_add_events(struct gendisk *disk) From 83cbce9574462c6b4eed6797bdaf18fae6859ab3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:40 +0200 Subject: [PATCH 201/315] block: add error handling for device_add_disk / add_disk Properly unwind on errors in device_add_disk. This is the initial work as drivers are not converted yet, which will follow in separate patches. Signed-off-by: Luis Chamberlain [hch: major rebase. All bugs are probably mine] Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-10-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 92 +++++++++++++++++++++++++++---------------- include/linux/genhd.h | 8 ++-- 2 files changed, 62 insertions(+), 38 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index a54b4849242c..a925f773145f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -417,11 +417,8 @@ static void disk_scan_partitions(struct gendisk *disk) * * This function registers the partitioning information in @disk * with the kernel. - * - * FIXME: error handling */ - -void device_add_disk(struct device *parent, struct gendisk *disk, +int device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { @@ -444,7 +441,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk, * and all partitions from the extended dev_t space. */ if (disk->major) { - WARN_ON(!disk->minors); + if (WARN_ON(!disk->minors)) + return -EINVAL; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -452,19 +450,20 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } } else { - WARN_ON(disk->minors); + if (WARN_ON(disk->minors)) + return -EINVAL; ret = blk_alloc_ext_minor(); - if (ret < 0) { - WARN_ON(1); - return; - } + if (ret < 0) + return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = MINOR(ret); disk->flags |= GENHD_FL_EXT_DEVT; } - disk_alloc_events(disk); + ret = disk_alloc_events(disk); + if (ret) + goto out_free_ext_minor; /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); @@ -474,15 +473,14 @@ void device_add_disk(struct device *parent, struct gendisk *disk, dev_set_name(ddev, "%s", disk->disk_name); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); - if (device_add(ddev)) - return; + ret = device_add(ddev); + if (ret) + goto out_disk_release_events; if (!sysfs_deprecated) { ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); - if (ret) { - device_del(ddev); - return; - } + if (ret) + goto out_device_del; } /* @@ -492,23 +490,25 @@ void device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - blk_integrity_add(disk); + ret = blk_integrity_add(disk); + if (ret) + goto out_del_block_link; disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); + if (!disk->part0->bd_holder_dir) + goto out_del_integrity; disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + if (!disk->slave_dir) + goto out_put_holder_dir; - /* - * XXX: this is a mess, can't wait for real error handling in add_disk. - * Make sure ->slave_dir is NULL if we failed some of the registration - * so that the cleanup in bd_unlink_disk_holder works properly. - */ - if (bd_register_pending_holders(disk) < 0) { - kobject_put(disk->slave_dir); - disk->slave_dir = NULL; - } + ret = bd_register_pending_holders(disk); + if (ret < 0) + goto out_put_slave_dir; - blk_register_queue(disk); + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; if (disk->flags & GENHD_FL_HIDDEN) { /* @@ -520,13 +520,13 @@ void device_add_disk(struct device *parent, struct gendisk *disk, } else { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); - WARN_ON(ret); + if (ret) + goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); - if (disk->bdi->dev) { - ret = sysfs_create_link(&ddev->kobj, - &disk->bdi->dev->kobj, "bdi"); - WARN_ON(ret); - } + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); disk_scan_partitions(disk); @@ -541,6 +541,30 @@ void device_add_disk(struct device *parent, struct gendisk *disk, disk_update_readahead(disk); disk_add_events(disk); + return 0; + +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); +out_put_slave_dir: + kobject_put(disk->slave_dir); +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_integrity: + blk_integrity_del(disk); +out_del_block_link: + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(ddev)); +out_device_del: + device_del(ddev); +out_disk_release_events: + disk_release_events(disk); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); + return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ } EXPORT_SYMBOL(device_add_disk); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 55acefdd8a20..c68d83c87f83 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -214,11 +214,11 @@ static inline dev_t disk_devt(struct gendisk *disk) void disk_uevent(struct gendisk *disk, enum kobject_action action); /* block/genhd.c */ -extern void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline void add_disk(struct gendisk *disk) +int device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int add_disk(struct gendisk *disk) { - device_add_disk(NULL, disk, NULL); + return device_add_disk(NULL, disk, NULL); } extern void del_gendisk(struct gendisk *gp); From dbb301f91fc855dccf9bc42fbc4281d89365906d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:41 +0200 Subject: [PATCH 202/315] virtio_blk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. Signed-off-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 767b4f72a54d..63dc121a4c43 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -875,9 +875,14 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); + if (err) + goto out_cleanup_disk; + return 0; +out_cleanup_disk: + blk_cleanup_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: From 10e7123d5551dec0025f70e61604ab57483a6ed2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 18 Aug 2021 16:45:42 +0200 Subject: [PATCH 203/315] null_blk: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. The actual cleanup in case of error is already handled by the caller of null_gendisk_register(). Signed-off-by: Luis Chamberlain Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210818144542.19305-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f128242d1170..187d779c8ca0 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1717,8 +1717,7 @@ static int null_gendisk_register(struct nullb *nullb) return ret; } - add_disk(disk); - return 0; + return add_disk(disk); } static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) From d3e9f732c415cf22faa33d6f195e291ad82dc92e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Aug 2021 08:37:25 -0600 Subject: [PATCH 204/315] io-wq: remove GFP_ATOMIC allocation off schedule out path Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running stress-ng: | [ 90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35 | [ 90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041 | [ 90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G W 5.14.0-rc4-rt4+ #89 | [ 90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 | [ 90.202561] Call Trace: | [ 90.202577] dump_stack_lvl+0x34/0x44 | [ 90.202584] ___might_sleep.cold+0x87/0x94 | [ 90.202588] rt_spin_lock+0x19/0x70 | [ 90.202593] ___slab_alloc+0xcb/0x7d0 | [ 90.202598] ? newidle_balance.constprop.0+0xf5/0x3b0 | [ 90.202603] ? dequeue_entity+0xc3/0x290 | [ 90.202605] ? io_wqe_dec_running.isra.0+0x98/0xe0 | [ 90.202610] ? pick_next_task_fair+0xb9/0x330 | [ 90.202612] ? __schedule+0x670/0x1410 | [ 90.202615] ? io_wqe_dec_running.isra.0+0x98/0xe0 | [ 90.202618] kmem_cache_alloc_trace+0x79/0x1f0 | [ 90.202621] io_wqe_dec_running.isra.0+0x98/0xe0 | [ 90.202625] io_wq_worker_sleeping+0x37/0x50 | [ 90.202628] schedule+0x30/0xd0 | [ 90.202630] schedule_timeout+0x8f/0x1a0 | [ 90.202634] ? __bpf_trace_tick_stop+0x10/0x10 | [ 90.202637] io_wqe_worker+0xfd/0x320 | [ 90.202641] ? finish_task_switch.isra.0+0xd3/0x290 | [ 90.202644] ? io_worker_handle_work+0x670/0x670 | [ 90.202646] ? io_worker_handle_work+0x670/0x670 | [ 90.202649] ret_from_fork+0x22/0x30 which is due to the RT kernel not liking a GFP_ATOMIC allocation inside a raw spinlock. Besides that not working on RT, doing any kind of allocation from inside schedule() is kind of nasty and should be avoided if at all possible. This particular path happens when an io-wq worker goes to sleep, and we need a new worker to handle pending work. We currently allocate a small data item to hold the information we need to create a new worker, but we can instead include this data in the io_worker struct itself and just protect it with a single bit lock. We only really need one per worker anyway, as we will have run pending work between to sleep cycles. https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@beryllium.lan/ Reported-by: Daniel Wagner Tested-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jens Axboe --- fs/io-wq.c | 72 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 7d2ed8c7dd31..4ce83bb48021 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -51,6 +51,10 @@ struct io_worker { struct completion ref_done; + unsigned long create_state; + struct callback_head create_work; + int create_index; + struct rcu_head rcu; }; @@ -272,24 +276,18 @@ static void io_wqe_inc_running(struct io_worker *worker) atomic_inc(&acct->nr_running); } -struct create_worker_data { - struct callback_head work; - struct io_wqe *wqe; - int index; -}; - static void create_worker_cb(struct callback_head *cb) { - struct create_worker_data *cwd; + struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; bool do_create = false, first = false; - cwd = container_of(cb, struct create_worker_data, work); - wqe = cwd->wqe; + worker = container_of(cb, struct io_worker, create_work); + wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[cwd->index]; + acct = &wqe->acct[worker->create_index]; raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) @@ -299,33 +297,42 @@ static void create_worker_cb(struct callback_head *cb) } raw_spin_unlock_irq(&wqe->lock); if (do_create) { - create_io_worker(wq, wqe, cwd->index, first); + create_io_worker(wq, wqe, worker->create_index, first); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } -static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) +static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, + struct io_wqe_acct *acct) { - struct create_worker_data *cwd; struct io_wq *wq = wqe->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) goto fail; + if (!io_worker_get(worker)) + goto fail; + /* + * create_state manages ownership of create_work/index. We should + * only need one entry per worker, as the worker going to sleep + * will trigger the condition, and waking will clear it once it + * runs the task_work. + */ + if (test_bit(0, &worker->create_state) || + test_and_set_bit_lock(0, &worker->create_state)) + goto fail_release; - cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); - if (cwd) { - init_task_work(&cwd->work, create_worker_cb); - cwd->wqe = wqe; - cwd->index = acct->index; - if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL)) - return; - - kfree(cwd); - } + init_task_work(&worker->create_work, create_worker_cb); + worker->create_index = acct->index; + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + return; + clear_bit_unlock(0, &worker->create_state); +fail_release: + io_worker_release(worker); fail: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -343,7 +350,7 @@ static void io_wqe_dec_running(struct io_worker *worker) if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(wqe, acct); + io_queue_worker_create(wqe, worker, acct); } } @@ -1004,12 +1011,12 @@ err_wq: static bool io_task_work_match(struct callback_head *cb, void *data) { - struct create_worker_data *cwd; + struct io_worker *worker; if (cb->func != create_worker_cb) return false; - cwd = container_of(cb, struct create_worker_data, work); - return cwd->wqe->wq == data; + worker = container_of(cb, struct io_worker, create_work); + return worker->wqe->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1026,12 +1033,13 @@ static void io_wq_exit_workers(struct io_wq *wq) return; while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct create_worker_data *cwd; + struct io_worker *worker; - cwd = container_of(cb, struct create_worker_data, work); - atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); + worker = container_of(cb, struct io_worker, create_work); + atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); io_worker_ref_put(wq); - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } rcu_read_lock(); From 5fd4617840596884334332f36cabfe0deabe85c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Aug 2021 14:04:31 -0600 Subject: [PATCH 205/315] io_uring: be smarter about waking multiple CQ ring waiters Currently we only wake the first waiter, even if we have enough entries posted to satisfy multiple waiters. Improve that situation so that every waiter knows how much the CQ tail has to advance before they can be safely woken up. With this change, if we have N waiters each asking for 1 event and we get 4 completions, then we wake up 4 waiters. If we have N waiters asking for 2 completions and we get 4 completions, then we wake up the first two. Previously, only the first waiter would've been woken up. Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a2e20a6fbfed..626c2784723d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1436,11 +1436,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) @@ -1453,12 +1455,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); @@ -6976,21 +6975,21 @@ static int io_sq_thread(void *data) struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; - unsigned to_wait; + unsigned cq_tail; unsigned nr_timeouts; }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; + int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx) >= iowq->to_wait || - atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, @@ -7053,7 +7052,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .entry = LIST_HEAD_INIT(iowq.wq.entry), }, .ctx = ctx, - .to_wait = min_events, }; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; @@ -7089,6 +7087,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ From 042b0d85eabb79909ef29063fb45d363cbc0a85d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:01 +0100 Subject: [PATCH 206/315] io_uring: use kvmalloc for fixed files Instead of hand-coded two-level tables for registered files, allocate them with kvmalloc(). In many cases small enough tables are enough, and so can be kmalloc()'ed removing an extra memory load and a bunch of bit logic instructions from the hot path. If the table is larger, we trade off all the pros with a TLB-assisted memory lookup. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/280421d3b48775dabab773006bb5588c7b2dabc0.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 626c2784723d..f8e6eb9de776 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,13 +92,8 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* - * Shift of 9 is 512 entries, or exactly one page on 64-bit archs - */ -#define IORING_FILE_TABLE_SHIFT 9 -#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) -#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) -#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +/* 512 entries per page on 64-bit archs, 64 pages max */ +#define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -235,8 +230,7 @@ struct io_rsrc_put { }; struct io_file_table { - /* two level table */ - struct io_fixed_file **files; + struct io_fixed_file *files; }; struct io_rsrc_node { @@ -6333,12 +6327,9 @@ static void io_wq_submit_work(struct io_wq_work *work) #define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) + unsigned i) { - struct io_fixed_file *table_l2; - - table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; - return &table_l2[i & IORING_FILE_TABLE_MASK]; + return &table->files[i]; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -7276,17 +7267,13 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - table->files = (struct io_fixed_file **)io_alloc_page_table(size); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL); return !!table->files; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_file_tables(struct io_file_table *table) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - io_free_page_table((void **)table->files, size); + kvfree(table->files); table->files = NULL; } @@ -7311,7 +7298,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) fput(file); } #endif - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7779,7 +7766,7 @@ out_fput: if (file) fput(file); } - io_free_file_tables(&ctx->file_table, nr_args); + io_free_file_tables(&ctx->file_table); ctx->nr_user_files = 0; out_free: io_rsrc_data_free(ctx->file_data); From ac177053bb2cb1f3c4c8bf89bce34c3f2c4823a7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:02 +0100 Subject: [PATCH 207/315] io_uring: inline fixed part of io_file_get() Optimise io_file_get() with registered files, which is in a hot path, by inlining parts of the function. Saves a function call, and inefficiencies of passing arguments, e.g. evaluating (sqe_flags & IOSQE_FIXED_FILE). It couldn't have been done before as compilers were refusing to inline it because of the function size. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/52115cd6ce28f33bd0923149c0e6cb611084a0b1.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 71 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f8e6eb9de776..69cf1665307f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1059,7 +1059,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_submit_state *state, +static struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -3621,7 +3622,8 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, NULL, req, + READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -6353,36 +6355,48 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static struct file *io_file_get(struct io_submit_state *state, - struct io_kiocb *req, int fd, bool fixed) +static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) { - struct io_ring_ctx *ctx = req->ctx; struct file *file; + unsigned long file_ptr; - if (fixed) { - unsigned long file_ptr; - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); - io_req_set_rsrc_node(req); - } else { - trace_io_uring_file_get(ctx, fd); - file = __io_file_get(state, fd); - - /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); - } - + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); + io_req_set_rsrc_node(req); return file; } +static struct file *io_file_get_normal(struct io_ring_ctx *ctx, + struct io_submit_state *state, + struct io_kiocb *req, int fd) +{ + struct file *file = __io_file_get(state, fd); + + trace_io_uring_file_get(ctx, fd); + + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); + return file; +} + +static inline struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_submit_state *state, + struct io_kiocb *req, int fd, bool fixed) +{ + if (fixed) + return io_file_get_fixed(ctx, req, fd); + else + return io_file_get_normal(ctx, state, req, fd); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6589,9 +6603,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; } From b191e2dfe5955b392bc8c0ae546dfa5a13649c38 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:03 +0100 Subject: [PATCH 208/315] io_uring: rename io_file_supports_async() io_file_supports_async() checks whether a file supports nowait operations, so "async" in the name is misleading. Rename it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/33d55b5ce43aa1884c637c1957f1e30d30dc3bec.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 69cf1665307f..7027c7e1f562 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -719,8 +719,8 @@ enum { REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_ASYNC_READ_BIT, - REQ_F_ASYNC_WRITE_BIT, + REQ_F_NOWAIT_READ_BIT, + REQ_F_NOWAIT_WRITE_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -766,9 +766,9 @@ enum { /* don't attempt request reissue, see io_rw_reissue() */ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), /* supports async reads */ - REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), /* supports async writes */ - REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -2631,7 +2631,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2664,14 +2664,14 @@ static bool __io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static bool io_file_supports_async(struct io_kiocb *req, int rw) +static bool io_file_supports_nowait(struct io_kiocb *req, int rw) { - if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) return true; - else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) return true; - return __io_file_supports_async(req->file, rw); + return __io_file_supports_nowait(req->file, rw); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -3295,7 +3295,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, READ)) { + if (force_nonblock && !io_file_supports_nowait(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3400,7 +3400,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, WRITE)) + if (force_nonblock && !io_file_supports_nowait(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -5208,7 +5208,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) } /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req, rw)) + if (!io_file_supports_nowait(req, rw)) return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -6346,9 +6346,9 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_async(file, READ)) + if (__io_file_supports_nowait(file, READ)) file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_async(file, WRITE)) + if (__io_file_supports_nowait(file, WRITE)) file_ptr |= FFS_ASYNC_WRITE; if (S_ISREG(file_inode(file)->i_mode)) file_ptr |= FFS_ISREG; @@ -6368,7 +6368,7 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); + req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); io_req_set_rsrc_node(req); return file; } From c97d8a0f68b30960e9c8089bc37cc3b96a96f84d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:04 +0100 Subject: [PATCH 209/315] io_uring: avoid touching inode in rw prep If we use fixed files, we can be sure (almost) that REQ_F_ISREG is set. However, for non-reg files io_prep_rw() still will look into inode to double check, and that's expensive and can be avoided. The only caveat is that it only currently works with 64+ bit architectures, see FFS_ISREG, so we should consider that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a62780c491ca2522cd52db4ae3f16e03aafed0f.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7027c7e1f562..69e5a2102f94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1231,6 +1231,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); +} + static void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -2682,7 +2696,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) + if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -6319,15 +6333,6 @@ static void io_wq_submit_work(struct io_wq_work *work) } } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) - static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, unsigned i) { From ebc11b6c6b87da5c83b4d934893a893f49160bc3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:05 +0100 Subject: [PATCH 210/315] io_uring: clean io-wq callbacks Move io-wq callbacks closer to each other, so it's easier to work with them, and rename io_free_work() into io_wq_free_work() for consistency. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/851bbc7f0f86f206d8c1333efee8bcb9c26e419f.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 69e5a2102f94..1a7c91e9891e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6298,6 +6298,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + req = io_put_req_find_next(req); + return req ? &req->work : NULL; +} + static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -7930,14 +7938,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? done : err; } -static struct io_wq_work *io_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? &req->work : NULL; -} - static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { @@ -7961,7 +7961,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_free_work; + data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ From 2215bed9246dbb95df50fcef788b0765c7c2aac0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:06 +0100 Subject: [PATCH 211/315] io_uring: remove unnecessary PF_EXITING check We prefer nornal task_works even if it would fail requests inside. Kill a PF_EXITING check in io_req_task_work_add(), task_work_add() handles well dying tasks, i.e. return error when can't enqueue due to late stages of do_exit(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fc14297e8441cd8f5d1743a2488cf0df09bf48ac.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a7c91e9891e..620ab4c90dff 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2006,8 +2006,6 @@ static void io_req_task_work_add(struct io_kiocb *req) if (test_bit(0, &tctx->task_state) || test_and_set_bit(0, &tctx->task_state)) return; - if (unlikely(tsk->flags & PF_EXITING)) - goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. For @@ -2020,7 +2018,7 @@ static void io_req_task_work_add(struct io_kiocb *req) wake_up_process(tsk); return; } -fail: + clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); node = tctx->task_list.first; From 8724dd8c833832c398c3578340374f45d6d9dd0d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:07 +0100 Subject: [PATCH 212/315] io-wq: improve wq_list_add_tail() Prepare nodes that we're going to add before actually linking them, it's always safer and costs us nothing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f7e53f0c84c02ed6748c488ed0789b98f8cc6185.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io-wq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io-wq.h b/fs/io-wq.h index 3999ee58ff26..308af3928424 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { + node->next = NULL; if (!list->first) { list->last = node; WRITE_ONCE(list->first, node); @@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } - node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, From 864ea921b0300fe5a4db9136b7e307e94b369530 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:08 +0100 Subject: [PATCH 213/315] io_uring: refactor io_alloc_req Replace the main if of io_flush_cached_reqs() with inverted condition + goto, so all the cases are handled in the same way. And also extract io_preinit_req() to make it cleaner and easier to refer to. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1abcba1f7b55dc53bf1dbe95036e345ffb1d5b01.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 66 +++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 620ab4c90dff..8892747bd173 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1702,6 +1702,19 @@ static void io_req_complete_failed(struct io_kiocb *req, long res) io_req_complete_post(req, res, 0); } +/* + * Don't initialise the fields below on every allocation, but do that in + * advance and keep them valid across allocations. + */ +static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; +} + static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_comp_state *cs) { @@ -1744,45 +1757,31 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - if (!state->free_reqs) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret, i; + if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) + goto got_req; - if (io_flush_cached_reqs(ctx)) - goto got_req; + ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, + state->reqs); - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); - - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; - ret = 1; - } - - /* - * Don't initialise the fields below on every allocation, but - * do that in advance and keep valid on free. - */ - for (i = 0; i < ret; i++) { - struct io_kiocb *req = state->reqs[i]; - - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->result = 0; - } - state->free_reqs = ret; + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. + */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + return NULL; + ret = 1; } + + for (i = 0; i < ret; i++) + io_preinit_req(state->reqs[i], ctx); + state->free_reqs = ret; got_req: state->free_reqs--; return state->reqs[state->free_reqs]; @@ -6569,6 +6568,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int personality, ret = 0; + /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); From a2416e1ec23c6b79010d03d69c0e4e035339b4ad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:09 +0100 Subject: [PATCH 214/315] io_uring: don't halt iopoll too early IOPOLL users should care more about getting completions for requests they submitted, but not in "device did/completed something". Currently, io_do_iopoll() may return a positive number, which will instruct io_iopoll_check() to break the loop and end the syscall, even if there is not enough CQEs or none at all. Don't return positive numbers, so io_iopoll_check() exits only when it gets an actual error, need reschedule or got enough CQEs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/641a88f751623b6758303b3171f0a4141f06726e.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8892747bd173..6a3079a038fb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2288,7 +2288,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, struct io_kiocb *req, *tmp; LIST_HEAD(done); bool spin; - int ret; /* * Only spin for completions if we don't have multiple devices hanging @@ -2296,9 +2295,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, */ spin = !ctx->poll_multi_queue && *nr_events < min; - ret = 0; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; + int ret; /* * Move completed and retryable entries to our local lists. @@ -2313,22 +2312,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, break; ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); - if (ret < 0) - break; + if (unlikely(ret < 0)) + return ret; + else if (ret) + spin = false; /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) list_move_tail(&req->inflight_entry, &done); - - if (ret && spin) - spin = false; - ret = 0; } if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done, resubmit); - return ret; + return 0; } /* From 282cdc86937bd31cf0ea49978ad7a42cfe12ea35 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:10 +0100 Subject: [PATCH 215/315] io_uring: add more locking annotations for submit Add more annotations for submission path functions holding ->uring_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/128ec4185e26fbd661dd3a424aa66108ee8ff951.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a3079a038fb..ff17c4e9aa42 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2133,6 +2133,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, } static void io_submit_flush_completions(struct io_ring_ctx *ctx) + __must_hold(&req->ctx->uring_lock) { struct io_comp_state *cs = &ctx->submit_state.comp; int i, nr = cs->nr; @@ -6473,6 +6474,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) } static void __io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); int ret; @@ -6516,6 +6518,7 @@ issue_sqe: } static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; @@ -6560,6 +6563,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state; unsigned int sqe_flags; @@ -6623,6 +6627,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; @@ -6755,6 +6760,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + __must_hold(&ctx->uring_lock) { struct io_uring_task *tctx; int submitted = 0; From 90291099f24a82863e00de136d95ad7e73560107 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 09:07:32 -0600 Subject: [PATCH 216/315] io_uring: optimise io_cqring_wait() hot path Turns out we always init struct io_wait_queue in io_cqring_wait(), even if it's not used after, i.e. there are already enough of CQEs. And often it's exactly what happens, for instance, requests may have been completed inline, or in case of io_uring_enter(submit=N, wait=1). It shows up in my profiler, so optimise it by delaying the struct init. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6f1b81c60b947d165583dc333947869c3d85d037.1628471125.git.asml.silence@gmail.com [axboe: fixed up for new cqring wait] Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ff17c4e9aa42..14aaeb87b149 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7063,14 +7063,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { - struct io_wait_queue iowq = { - .wq = { - .private = current, - .func = io_wake_function, - .entry = LIST_HEAD_INIT(iowq.wq.entry), - }, - .ctx = ctx, - }; + struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; int ret; @@ -7104,8 +7097,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, timeout = timespec64_to_jiffies(&ts); } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ From e73c5c7cd3e21bb95032a9ed3593c000f17f9ab8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:12 +0100 Subject: [PATCH 217/315] io_uring: extract a helper for ctx quiesce Refactor __io_uring_register() by extracting a helper responsible for ctx queisce. Looks better and will make it easier to add more optimisations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0339e0027504176be09237eefa7945bf9a6f153d.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 ++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 14aaeb87b149..03c32d392735 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10074,6 +10074,33 @@ static bool io_register_op_must_quiesce(int op) } } +static int io_ctx_quiesce(struct io_ring_ctx *ctx) +{ + long ret; + + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab the + * uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + } while (ret >= 0); + mutex_lock(&ctx->uring_lock); + + if (ret) + io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -10098,31 +10125,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } if (io_register_op_must_quiesce(opcode)) { - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If - * another thread is currently inside io_uring_enter() it might - * need to grab the uring_lock to make progress. If we hold it - * here across the drain wait, then we can deadlock. It's safe - * to drop the mutex here, since no new references will come in - * after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) - break; - ret = io_run_task_work_sig(); - if (ret < 0) - break; - } while (1); - mutex_lock(&ctx->uring_lock); - - if (ret) { - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + ret = io_ctx_quiesce(ctx); + if (ret) return ret; - } } switch (opcode) { From 6a290a1442b45afb55d6a87619b716e5031d7c3e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:13 +0100 Subject: [PATCH 218/315] io_uring: move io_put_task() definition Move the function in the source file as it is to get rid of forward declarations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/33d917d69e4206557c75a5b98fe22bcdf77ce47d.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 03c32d392735..a536b2509d6d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1052,7 +1052,6 @@ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req, int nr); static void io_dismantle_req(struct io_kiocb *req); -static void io_put_task(struct task_struct *task, int nr); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1571,6 +1570,17 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + struct io_uring_task *tctx = task->io_uring; + + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags) { @@ -1809,17 +1819,6 @@ static void io_dismantle_req(struct io_kiocb *req) } } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; From b9bd2bea0f22f502019266dce368a9cd477ac721 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 09:09:47 -0600 Subject: [PATCH 219/315] io_uring: move io_rsrc_node_alloc() definition Move the function together with io_rsrc_node_ref_zero() in the source file as it is to get rid of forward declarations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d81f6f833e7d017860b24463a9a68b14a8a5ed2.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 89 +++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a536b2509d6d..ee98aebb6580 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1045,7 +1045,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags); @@ -7160,6 +7159,50 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } +static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) { @@ -7674,50 +7717,6 @@ static void io_rsrc_put_work(struct work_struct *work) } } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { From 543af3a13da308f2cea954644b43c2c9f864c350 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:15 +0100 Subject: [PATCH 220/315] io_uring: inline io_free_req_deferred Inline io_free_req_deferred(), there is no reason to keep it separated. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ce04b7180d4eac0d69dd00677b227eefe80c2cc5.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ee98aebb6580..576100b23147 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2182,16 +2182,12 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_free_req_deferred(struct io_kiocb *req) -{ - req->io_task_work.func = io_free_req; - io_req_task_work_add(req); -} - static inline void io_put_req_deferred(struct io_kiocb *req, int refs) { - if (req_ref_sub_and_test(req, refs)) - io_free_req_deferred(req); + if (req_ref_sub_and_test(req, refs)) { + req->io_task_work.func = io_free_req; + io_req_task_work_add(req); + } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) From d3fddf6dddd84432161eb070ed8e34d14c8bf56a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:16 +0100 Subject: [PATCH 221/315] io_uring: deduplicate open iopoll check Move IORING_SETUP_IOPOLL check into __io_openat_prep(), so both openat and openat2 reuse it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a73ce83e4ee60d011180ef177eecef8e87ff2a2.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 576100b23147..26a3ab9b6aed 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3789,6 +3789,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ -3813,12 +3815,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - u64 flags, mode; + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - mode = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } @@ -3829,8 +3828,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) From 58d3be2c60d2cf4e6bb65bb6200fa39a7bc477f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:17 +0100 Subject: [PATCH 222/315] io_uring: improve ctx hang handling If io_ring_exit_work() can't get it done in 5 minutes, something is going very wrong, don't keep spinning at HZ / 20 rate, it doesn't help and it may take much of CPU time if there is a lot of workers stuck as such. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9e2d1ca81d569f6bc628af1a42ff6663bff7ce9c.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 26a3ab9b6aed..af9472e3f17b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8795,6 +8795,7 @@ static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8819,8 +8820,11 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } - WARN_ON_ONCE(time_after(jiffies, timeout)); - } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { + /* there is little hope left, don't run it too often */ + interval = HZ * 60; + } + } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); From bbbca0948989aa1a8a75b99bcdece677ad06dfe6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:18 +0100 Subject: [PATCH 223/315] io_uring: kill unused IO_IOPOLL_BATCH IO_IOPOLL_BATCH is not used, delete it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b2bdf19dbee2c9fc8865bbab9412135a14e24a64.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index af9472e3f17b..8cdcd645f3d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -295,7 +295,6 @@ struct io_sq_data { struct completion exited; }; -#define IO_IOPOLL_BATCH 8 #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 From af066f31eb3dac2a11516315d47a286a7b3b07df Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:19 +0100 Subject: [PATCH 224/315] io_uring: drop exec checks from io_req_task_submit In case of on-exec io_uring cancellations, tasks already wait for all submitted requests to get completed/cancelled, so we don't need to check for ->in_execve separately. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/be8707049f10df9d20ca03dc4ca3316239b5e8e0.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cdcd645f3d8..cf75de0d74d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2046,7 +2046,7 @@ static void io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!(req->task->flags & PF_EXITING) && !req->task->in_execve) + if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); From e9dbe221f5d1c974c853da94eee456803239cab5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 13:04:20 +0100 Subject: [PATCH 225/315] io_uring: optimise putting task struct We cache all the reference to task + tctx, so if io_put_task() is called by the corresponding task itself, we can save on atomics and return the refs right back into the cache. It's beneficial for all inline completions, and also iopolling, when polling and submissions are done by the same task, including SQPOLL|IOPOLL. Note: io_uring_cancel_generic() can return refs to the cache as well, so those should be flushed in the loop for tctx_inflight() to work right. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6fe9646b3cb70e46aca1f58426776e368c8926b3.1628471125.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cf75de0d74d2..2eeaf4a15367 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2102,10 +2102,12 @@ static inline void io_init_req_batch(struct req_batch *rb) static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); + if (rb->task == current) + current->io_uring->cached_refs += rb->task_refs; + else if (rb->task) + io_put_task(rb->task, rb->task_refs); } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, @@ -9138,9 +9140,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } } /* @@ -9161,9 +9165,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) @@ -9187,6 +9191,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + io_uring_drop_tctx_refs(current); /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did From f56165e62fae78200292857628e4f1d8d12a0ed0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:07 +0100 Subject: [PATCH 226/315] io_uring: move io_fallback_req_func() Move io_fallback_req_func() to kill yet another forward declaration. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0a8f9d9a0057ed761d6237167d51c9378798d2d.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2eeaf4a15367..86162ebd6bae 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1067,8 +1067,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); -static void io_fallback_req_func(struct work_struct *unused); - static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -1145,6 +1143,19 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + + percpu_ref_get(&ctx->refs); + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req); + percpu_ref_put(&ctx->refs); +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -2468,19 +2479,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void io_fallback_req_func(struct work_struct *work) -{ - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - - percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); - percpu_ref_put(&ctx->refs); -} - static void __io_complete_rw(struct io_kiocb *req, long res, long res2, unsigned int issue_flags) { From c34b025f2d2149d4351b994a923fa687a32478f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:08 +0100 Subject: [PATCH 227/315] io_uring: cache __io_free_req()'d requests Don't kfree requests in __io_free_req() but put them back into the internal request cache. That makes allocations more sustainable and will be used for refcounting optimisations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9f4950fbe7771c8d41799366d0a3a08ac3040236.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 86162ebd6bae..cc2d3de16423 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1831,11 +1831,16 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; io_dismantle_req(req); io_put_task(req->task, 1); - kmem_cache_free(req_cachep, req); + spin_lock_irqsave(&ctx->completion_lock, flags); + list_add(&req->compl.list, &ctx->locked_free_list); + ctx->locked_free_nr++; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + percpu_ref_put(&ctx->refs); } From 7255834ed6ef9658b9e7fb192da6a323a64eac98 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:09 +0100 Subject: [PATCH 228/315] io_uring: remove redundant args from cache_free We don't use @tsk argument of io_req_cache_free(), remove it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6a28b4a58ee0aaf0db98e2179b9c9f06f9b0cca1.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cc2d3de16423..c6f07a3d8e0d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8622,13 +8622,11 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) __io_remove_buffers(ctx, buf, index, -1U); } -static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) +static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; list_for_each_entry_safe(req, nxt, list, compl.list) { - if (tsk && req->task != tsk) - continue; list_del(&req->compl.list); kmem_cache_free(req_cachep, req); } @@ -8648,7 +8646,7 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) } io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list, NULL); + io_req_cache_free(&cs->free_list); mutex_unlock(&ctx->uring_lock); } From bb943b8265c84e9553903161bc39ff45f427d00d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:10 +0100 Subject: [PATCH 229/315] io_uring: use inflight_entry instead of compl.list req->compl.list is used to cache freed requests, and so can't overlap in time with req->inflight_entry. So, use inflight_entry to link requests and remove compl.list. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e430e79d22d70a190d718831bda7bfed1daf8976.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c6f07a3d8e0d..f05ec6e07eec 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -670,7 +670,6 @@ struct io_unlink { struct io_completion { struct file *file; - struct list_head list; u32 cflags; }; @@ -1670,7 +1669,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) @@ -1761,9 +1760,9 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) nr = state->free_reqs; while (!list_empty(&cs->free_list)) { struct io_kiocb *req = list_first_entry(&cs->free_list, - struct io_kiocb, compl.list); + struct io_kiocb, inflight_entry); - list_del(&req->compl.list); + list_del(&req->inflight_entry); state->reqs[nr++] = req; if (nr == ARRAY_SIZE(state->reqs)) break; @@ -1837,7 +1836,7 @@ static void __io_free_req(struct io_kiocb *req) io_put_task(req->task, 1); spin_lock_irqsave(&ctx->completion_lock, flags); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2144,7 +2143,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->compl.list, &state->comp.free_list); + list_add(&req->inflight_entry, &state->comp.free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -8626,8 +8625,8 @@ static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; - list_for_each_entry_safe(req, nxt, list, compl.list) { - list_del(&req->compl.list); + list_for_each_entry_safe(req, nxt, list, inflight_entry) { + list_del(&req->inflight_entry); kmem_cache_free(req_cachep, req); } } From cd0ca2e048dc0ddea4f59354b0b8ce4548a76a91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:11 +0100 Subject: [PATCH 230/315] io_uring: inline struct io_comp_state Inline struct io_comp_state into struct io_submit_state. They are already coupled tightly, together with mixed responsibilities it only brings confusion having them separately. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e55bba77426b399e3a2e54e3c6c267c6a0fc4b57.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 +++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f05ec6e07eec..a927e2173d98 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -299,13 +299,6 @@ struct io_sq_data { #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_comp_state { - struct io_kiocb *reqs[IO_COMPL_BATCH]; - unsigned int nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; -}; - struct io_submit_link { struct io_kiocb *head; struct io_kiocb *last; @@ -326,7 +319,10 @@ struct io_submit_state { /* * Batch completion logic */ - struct io_comp_state comp; + struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; + unsigned int compl_nr; + /* inline/task_work completion list, under ->uring_lock */ + struct list_head free_list; /* * File reference cache @@ -1208,7 +1204,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); + INIT_LIST_HEAD(&ctx->submit_state.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; @@ -1734,10 +1730,10 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_comp_state *cs) + struct io_submit_state *state) { spin_lock_irq(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &cs->free_list); + list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); } @@ -1746,7 +1742,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - struct io_comp_state *cs = &state->comp; int nr; /* @@ -1755,11 +1750,11 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * side cache. */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, cs); + io_flush_cached_locked_reqs(ctx, state); nr = state->free_reqs; - while (!list_empty(&cs->free_list)) { - struct io_kiocb *req = list_first_entry(&cs->free_list, + while (!list_empty(&state->free_list)) { + struct io_kiocb *req = list_first_entry(&state->free_list, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); @@ -1946,7 +1941,7 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) { if (!ctx) return; - if (ctx->submit_state.comp.nr) { + if (ctx->submit_state.compl_nr) { mutex_lock(&ctx->uring_lock); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); @@ -2143,19 +2138,19 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->inflight_entry, &state->comp.free_list); + list_add(&req->inflight_entry, &state->free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&req->ctx->uring_lock) { - struct io_comp_state *cs = &ctx->submit_state.comp; - int i, nr = cs->nr; + struct io_submit_state *state = &ctx->submit_state; + int i, nr = state->compl_nr; struct req_batch rb; spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); @@ -2166,7 +2161,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; /* submission and completion refs */ if (req_ref_sub_and_test(req, 2)) @@ -2174,7 +2169,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) } io_req_free_batch_finish(ctx, &rb); - cs->nr = 0; + state->compl_nr = 0; } /* @@ -6484,10 +6479,10 @@ issue_sqe: /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; - cs->reqs[cs->nr++] = req; - if (cs->nr == ARRAY_SIZE(cs->reqs)) + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); } else { io_put_req(req); @@ -6690,7 +6685,7 @@ static void io_submit_state_end(struct io_submit_state *state, { if (state->link.head) io_queue_sqe(state->link.head); - if (state->comp.nr) + if (state->compl_nr) io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); @@ -8633,19 +8628,17 @@ static void io_req_cache_free(struct list_head *list) static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *submit_state = &ctx->submit_state; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) { - kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, - submit_state->reqs); - submit_state->free_reqs = 0; + if (state->free_reqs) { + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); + state->free_reqs = 0; } - io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list); + io_flush_cached_locked_reqs(ctx, state); + io_req_cache_free(&state->free_list); mutex_unlock(&ctx->uring_lock); } From 90f67366cb8871951399fb5bcf182e902b896615 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:12 +0100 Subject: [PATCH 231/315] io_uring: remove extra argument for overflow flush Unlike __io_cqring_overflow_flush(), nobody does forced flushing with io_cqring_overflow_flush(), so removed the argument from it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7594f869ca41b7cfb5a35a3c7c2d402242834e9e.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a927e2173d98..e2718ce6dfbc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1523,7 +1523,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; @@ -1531,7 +1531,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force); + ret = __io_cqring_overflow_flush(ctx, false); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -7058,7 +7058,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret; do { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -7096,7 +7096,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false)) { + if (!io_cqring_overflow_flush(ctx)) { ret = -EBUSY; break; } @@ -9365,7 +9365,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; From 5d70904367b45b74dab9da5c023b6629f511e48f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Aug 2021 20:18:13 +0100 Subject: [PATCH 232/315] io_uring: inline io_poll_remove_waitqs Inline io_poll_remove_waitqs() into its only user and clean it up. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2f1a91a19ffcd591531dc4c61e2f11c64a2d6a6d.1628536684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e2718ce6dfbc..a8cef973a601 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1059,7 +1059,6 @@ static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); -static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -5251,34 +5250,24 @@ static bool __io_poll_remove_one(struct io_kiocb *req, return do_complete; } -static bool io_poll_remove_waitqs(struct io_kiocb *req) +static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { + int refs; bool do_complete; io_poll_remove_double(req); do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - if (req->opcode != IORING_OP_POLL_ADD && do_complete) { - /* non-poll requests have submit ref still */ - req_ref_put(req); - } - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - do_complete = io_poll_remove_waitqs(req); if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - io_put_req_deferred(req, 1); - } + /* non-poll requests have submit ref still */ + refs = 1 + (req->opcode != IORING_OP_POLL_ADD); + io_put_req_deferred(req, refs); + } return do_complete; } From 6294f3686b4d77771ab8b161304ada546e71d36a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 10 Aug 2021 17:53:55 +0100 Subject: [PATCH 233/315] io_uring: clean up tctx_task_work() After recent fixes, tctx_task_work() always does proper spinlocking before looking into ->task_list, so now we don't need atomics for ->task_state, replace it with non-atomic task_running using the critical section. Tide it up, combine two separate block with spinlocking, and always try to splice in there, so we do less locking when new requests are arriving during the function execution. Signed-off-by: Pavel Begunkov [axboe: fix missing ->task_running reset on task_work_add() failure] Signed-off-by: Jens Axboe --- fs/io_uring.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a8cef973a601..60313502a234 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -476,8 +476,8 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - unsigned long task_state; struct callback_head task_work; + bool task_running; }; /* @@ -1960,9 +1960,13 @@ static void tctx_task_work(struct callback_head *cb) spin_lock_irq(&tctx->task_lock); node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); + if (!node) + tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); + if (!node) + break; - while (node) { + do { struct io_wq_work_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1974,19 +1978,8 @@ static void tctx_task_work(struct callback_head *cb) } req->io_task_work.func(req); node = next; - } - if (wq_list_empty(&tctx->task_list)) { - spin_lock_irq(&tctx->task_lock); - clear_bit(0, &tctx->task_state); - if (wq_list_empty(&tctx->task_list)) { - spin_unlock_irq(&tctx->task_lock); - break; - } - spin_unlock_irq(&tctx->task_lock); - /* another tctx_task_work() is enqueued, yield */ - if (test_and_set_bit(0, &tctx->task_state)) - break; - } + } while (node); + cond_resched(); } @@ -2000,16 +1993,19 @@ static void io_req_task_work_add(struct io_kiocb *req) enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; + bool running; WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + running = tctx->task_running; + if (!running) + tctx->task_running = true; spin_unlock_irqrestore(&tctx->task_lock, flags); /* task_work already pending, we're done */ - if (test_bit(0, &tctx->task_state) || - test_and_set_bit(0, &tctx->task_state)) + if (running) return; /* @@ -2024,8 +2020,8 @@ static void io_req_task_work_add(struct io_kiocb *req) return; } - clear_bit(0, &tctx->task_state); spin_lock_irqsave(&tctx->task_lock, flags); + tctx->task_running = false; node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); From 62906e89e63ba497105c0e3558089a10365f4f33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 10 Aug 2021 14:52:47 +0100 Subject: [PATCH 234/315] io_uring: remove file batch-get optimisation For requests with non-fixed files, instead of grabbing just one reference, we get by the number of left requests, so the following requests using the same file can take it without atomics. However, it's not all win. If there is one request in the middle not using files or having a fixed file, we'll need to put back the left references. Even worse if an application submits requests dealing with different files, it will do a put for each new request, so doubling the number of atomics needed. Also, even if not used, it's still takes some cycles in the submission path. If a file used many times, it rather makes sense to pre-register it, if not, we may fall in the described pitfall. So, this optimisation is a matter of use case. Go with the simpliest code-wise way, remove it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 53 ++++----------------------------------------------- 1 file changed, 4 insertions(+), 49 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 60313502a234..64241e2d3dd9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -324,12 +324,6 @@ struct io_submit_state { /* inline/task_work completion list, under ->uring_lock */ struct list_head free_list; - /* - * File reference cache - */ - struct file *file; - unsigned int fd; - unsigned int file_refs; unsigned int ios_left; }; @@ -1052,7 +1046,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -2583,40 +2576,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req) } } -static inline void io_state_file_put(struct io_submit_state *state) -{ - if (state->file_refs) { - fput_many(state->file, state->file_refs); - state->file_refs = 0; - } -} - -/* - * Get as many references to a file as we have IOs left in this submission, - * assuming most submissions are for one file, or at least that each file - * has more than one submission. - */ -static struct file *__io_file_get(struct io_submit_state *state, int fd) -{ - if (!state) - return fget(fd); - - if (state->file_refs) { - if (state->fd == fd) { - state->file_refs--; - return state->file; - } - io_state_file_put(state); - } - state->file = fget_many(fd, state->ios_left); - if (unlikely(!state->file)) - return NULL; - - state->fd = fd; - state->file_refs = state->ios_left - 1; - return state->file; -} - static bool io_bdev_nowait(struct block_device *bdev) { return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); @@ -3618,8 +3577,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(req->ctx, NULL, req, - READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -6356,10 +6314,9 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, } static struct file *io_file_get_normal(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd) { - struct file *file = __io_file_get(state, fd); + struct file *file = fget(fd); trace_io_uring_file_get(ctx, fd); @@ -6370,13 +6327,12 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx, } static inline struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_submit_state *state, struct io_kiocb *req, int fd, bool fixed) { if (fixed) return io_file_get_fixed(ctx, req, fd); else - return io_file_get_normal(ctx, state, req, fd); + return io_file_get_normal(ctx, req, fd); } static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) @@ -6589,7 +6545,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - req->file = io_file_get(ctx, state, req, READ_ONCE(sqe->fd), + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; @@ -6674,7 +6630,6 @@ static void io_submit_state_end(struct io_submit_state *state, io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); - io_state_file_put(state); } /* From 89850fce16a1a75caacca77cfa0c829aeea4f886 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:11:51 -0600 Subject: [PATCH 235/315] io_uring: run timeouts from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ context. Add a timeout_lock to handle the ordering of timeout completions or cancelations with the timeouts actually triggering. Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 64241e2d3dd9..b8d09565d3ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -409,6 +409,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + spinlock_t timeout_lock; + /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -1188,6 +1190,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); + spin_lock_init(&ctx->timeout_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -1328,6 +1331,7 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -1355,9 +1359,12 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } static void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + unsigned long flags; + spin_lock_irqsave(&ctx->timeout_lock, flags); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1382,6 +1389,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -5455,6 +5463,20 @@ err: return 0; } +static void io_req_task_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + req_set_fail(req); + io_put_req(req); +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5463,24 +5485,20 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - io_cqring_ev_posted(ctx); - req_set_fail(req); - io_put_req(req); + req->io_task_work.func = io_req_task_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_kiocb *req; @@ -5502,7 +5520,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5517,7 +5535,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; @@ -5576,13 +5594,15 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) ret = io_timeout_cancel(ctx, tr->addr); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); + spin_unlock_irq(&ctx->timeout_lock); + spin_lock_irq(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); @@ -5637,7 +5657,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = req->timeout.off; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -5676,7 +5696,7 @@ add: list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); return 0; } @@ -5730,7 +5750,9 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, spin_lock_irqsave(&ctx->completion_lock, flags); if (ret != -ENOENT) goto done; + spin_lock(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -5772,7 +5794,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) spin_lock_irq(&ctx->completion_lock); if (ret != -ENOENT) goto done; + spin_lock(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -8801,12 +8825,14 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, int canceled = 0; spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } + spin_unlock(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); From 89b263f6d56e683ddcf7643140271ef6e36c72b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:14:18 -0600 Subject: [PATCH 236/315] io_uring: run linked timeouts from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b8d09565d3ba..8a1c461559ac 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -538,6 +538,8 @@ struct io_timeout { struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; }; struct io_timeout_rem { @@ -1848,6 +1850,7 @@ static inline void io_remove_next_linked(struct io_kiocb *req) static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *link = req->link; @@ -1892,8 +1895,13 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + if (likely(req->flags & REQ_F_LINK_TIMEOUT)) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); posted = io_kill_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); @@ -6359,6 +6367,20 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, return io_file_get_normal(ctx, req, fd); } +static void io_req_task_link_timeout(struct io_kiocb *req) +{ + struct io_kiocb *prev = req->timeout.prev; + struct io_ring_ctx *ctx = req->ctx; + + if (prev) { + io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); + io_put_req(prev); + io_put_req(req); + } else { + io_req_complete_post(req, -ETIME, 0); + } +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6367,7 +6389,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); prev = req->timeout.head; req->timeout.head = NULL; @@ -6380,15 +6402,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - spin_unlock_irqrestore(&ctx->completion_lock, flags); + req->timeout.prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); - io_put_req_deferred(prev, 1); - io_put_req_deferred(req, 1); - } else { - io_req_complete_post(req, -ETIME, 0); - } + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -6396,7 +6414,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6408,7 +6426,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } From 8ef12efe26c8e44323011e57753b8c0e87af1582 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:15:25 -0600 Subject: [PATCH 237/315] io_uring: run regular file completions from task_work This is in preparation to making the completion lock work outside of hard/soft IRQ context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a1c461559ac..6477b0d52c19 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2482,31 +2482,48 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, - unsigned int issue_flags) +static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - int cflags = 0; - if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; - return; + return true; } req_set_fail(req); + req->result = res; } + return false; +} + +static void io_req_task_complete(struct io_kiocb *req) +{ + int cflags = 0; + if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, res, cflags); + __io_req_complete(req, 0, req->result, cflags); +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + io_req_task_complete(req); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - __io_complete_rw(req, res, res2, 0); + if (__io_complete_rw_common(req, res)) + return; + req->result = res; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) From 79ebeaee8a21a00417d89f1a02019f79840d9bad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 10 Aug 2021 15:18:27 -0600 Subject: [PATCH 238/315] io_uring: remove IRQ aspect of io_ring_ctx completion lock We have no hard/soft IRQ users of this lock left, remove any IRQ disabling/saving and restoring when grabbing this lock. This is straight forward with no users entering with IRQs disabled anymore, the only thing to look out for is the waitqueue poll head lock which nests inside the completion lock. That needs IRQs disabled, and hence we have to do that now instead of relying on the outer lock doing so. Signed-off-by: Jens Axboe --- fs/io_uring.c | 154 ++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 80 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6477b0d52c19..52cad59c0739 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1292,10 +1292,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); @@ -1364,9 +1364,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + spin_lock_irq(&ctx->timeout_lock); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1391,7 +1390,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + spin_unlock_irq(&ctx->timeout_lock); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1484,14 +1483,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - unsigned long flags; bool all_flushed, posted; if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -1519,7 +1517,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (posted) io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); return all_flushed; @@ -1648,9 +1646,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked @@ -1674,7 +1671,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req = NULL; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (req) { io_cqring_ev_posted(ctx); @@ -1734,10 +1731,10 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } /* Returns true IFF there are requests in the cache */ @@ -1827,15 +1824,14 @@ static void io_dismantle_req(struct io_kiocb *req) static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; io_dismantle_req(req); io_put_task(req->task, 1); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); percpu_ref_put(&ctx->refs); } @@ -1922,14 +1918,13 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) */ if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; bool posted; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) io_commit_cqring(req->ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); } @@ -2152,7 +2147,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) int i, nr = state->compl_nr; struct req_batch rb; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < nr; i++) { struct io_kiocb *req = state->compl_reqs[i]; @@ -2160,7 +2155,7 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) req->compl.cflags); } io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_init_req_batch(&rb); @@ -4865,7 +4860,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) req->result = vfs_poll(req->file, &pt) & poll->events; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req->result && !READ_ONCE(poll->canceled)) { add_wait_queue(poll->head, &poll->wait); return true; @@ -4899,12 +4894,12 @@ static void io_poll_remove_double(struct io_kiocb *req) if (poll && poll->head) { struct wait_queue_head *head = poll->head; - spin_lock(&head->lock); + spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) req_ref_put(req); poll->head = NULL; - spin_unlock(&head->lock); + spin_unlock_irq(&head->lock); } } @@ -4940,7 +4935,7 @@ static void io_poll_task_func(struct io_kiocb *req) struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { bool done; @@ -4952,7 +4947,7 @@ static void io_poll_task_func(struct io_kiocb *req) req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (done) { @@ -4969,6 +4964,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); + unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -4981,13 +4977,13 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (poll->head) { bool done; - spin_lock(&poll->head->lock); + spin_lock_irqsave(&poll->head->lock, flags); done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); /* make sure double remove sees this as being gone */ wait->private = NULL; - spin_unlock(&poll->head->lock); + spin_unlock_irqrestore(&poll->head->lock, flags); if (!done) { /* use wait func handler, so it matches the rq type */ poll->wait.func(&poll->wait, mode, sync, key); @@ -5075,13 +5071,13 @@ static void io_async_task_func(struct io_kiocb *req) trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return; } hash_del(&req->hash_node); io_poll_remove_double(req); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) io_req_task_submit(req); @@ -5133,11 +5129,11 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, if (unlikely(!ipt->nr_entries) && !ipt->error) ipt->error = -EINVAL; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) io_poll_remove_double(req); if (likely(poll->head)) { - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (unlikely(list_empty(&poll->wait.entry))) { if (ipt->error) cancel = true; @@ -5150,7 +5146,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ io_poll_req_insert(req); - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); } return mask; @@ -5206,12 +5202,12 @@ static int io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret || ipt.error) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (ret) return IO_APOLL_READY; return IO_APOLL_ABORTED; } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; @@ -5225,14 +5221,14 @@ static bool __io_poll_remove_one(struct io_kiocb *req, if (!poll->head) return false; - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (do_cancel) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; } - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); hash_del(&req->hash_node); return do_complete; } @@ -5268,7 +5264,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req; int posted = 0, i; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list; @@ -5278,7 +5274,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, posted += io_poll_remove_one(req); } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -5416,7 +5412,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.error = 0; io_poll_complete(req, mask); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); @@ -5433,7 +5429,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) bool completing; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; @@ -5460,7 +5456,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) ret = 0; err: if (ret < 0) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); req_set_fail(req); io_req_complete(req, ret); return 0; @@ -5473,7 +5469,7 @@ err: } if (req->poll_update.update_user_data) preq->user_data = req->poll_update.new_user_data; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* complete update request, we're done with it */ io_req_complete(req, ret); @@ -5492,10 +5488,10 @@ static void io_req_task_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); req_set_fail(req); @@ -5627,10 +5623,10 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) io_translate_timeout_mode(tr->flags)); spin_unlock_irq(&ctx->timeout_lock); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); @@ -5768,16 +5764,15 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, int success_ret) { - unsigned long flags; int ret; ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); if (ret != -ENOENT) goto done; - spin_lock(&ctx->timeout_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); @@ -5786,7 +5781,7 @@ done: ret = success_ret; io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) @@ -5816,18 +5811,18 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) /* tasks should wait for their io-wq threads, so safe w/o sync */ ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ret != -ENOENT) goto done; - spin_lock(&ctx->timeout_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) goto done; ret = io_poll_cancel(ctx, sqe_addr, false); if (ret != -ENOENT) goto done; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5841,11 +5836,11 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); done: io_cqring_fill_event(ctx, req->user_data, ret, 0); io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) @@ -6061,9 +6056,9 @@ fail: return true; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); kfree(de); io_queue_async_work(req); return true; @@ -6073,7 +6068,7 @@ fail: de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return true; } @@ -6813,18 +6808,18 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) { - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_flags, ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) @@ -7671,11 +7666,11 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -8846,8 +8841,8 @@ static void io_ring_exit_work(struct work_struct *work) mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); - spin_lock_irq(&ctx->completion_lock); - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_ring_ctx_free(ctx); } @@ -8859,18 +8854,18 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req, *tmp; int canceled = 0; - spin_lock_irq(&ctx->completion_lock); - spin_lock(&ctx->timeout_lock); + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } - spin_unlock(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); return canceled != 0; @@ -8926,13 +8921,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) bool ret; if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); } else { ret = io_match_task(req, cancel->task, cancel->all); } @@ -8945,14 +8939,14 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; @@ -9502,7 +9496,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; struct io_kiocb *req; @@ -9511,7 +9505,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (has_lock) mutex_unlock(&ctx->uring_lock); } From 21c843d5825b949332fe58495007ca531ef6ae91 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:27 +0100 Subject: [PATCH 239/315] io_uring: move req_ref_get() and friends Move all request refcount helpers to avoid forward declarations in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/89fd36f6f3fe5b733dfe4546c24725eee40df605.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 +++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 52cad59c0739..1b79f6e2da2e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1078,6 +1078,41 @@ EXPORT_SYMBOL(io_uring_get_socket); #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_sub_and_test(refs, &req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_put_and_test(req)); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1539,41 +1574,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. - */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task, int nr) { From 91c2f6978311afe1f49094fdd90fd6ab29b66223 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:28 +0100 Subject: [PATCH 240/315] io_uring: remove req_ref_sub_and_test() Soon, we won't need to put several references at once, remove req_ref_sub_and_test() and @nr argument from io_put_req_deferred(), and put the rest of the references by hand. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1868c7554108bff9194fb5757e77be23fadf7fc0.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1b79f6e2da2e..9d1c18865ef4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1041,7 +1041,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req, int nr); +static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); @@ -1090,12 +1090,6 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); -} - static inline bool req_ref_put_and_test(struct io_kiocb *req) { WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); @@ -1377,7 +1371,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); io_cqring_fill_event(req->ctx, req->user_data, status, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } } @@ -1862,7 +1856,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) if (hrtimer_try_to_cancel(&io->timer) != -1) { io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 1); + io_put_req_deferred(link); return true; } } @@ -1881,7 +1875,9 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 2); + + io_put_req(link); + io_put_req_deferred(link); link = nxt; } } @@ -2163,7 +2159,8 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = state->compl_reqs[i]; /* submission and completion refs */ - if (req_ref_sub_and_test(req, 2)) + io_put_req(req); + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2192,9 +2189,9 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static inline void io_put_req_deferred(struct io_kiocb *req, int refs) +static inline void io_put_req_deferred(struct io_kiocb *req) { - if (req_ref_sub_and_test(req, refs)) { + if (req_ref_put_and_test(req)) { req->io_task_work.func = io_free_req; io_req_task_work_add(req); } @@ -5236,7 +5233,6 @@ static bool __io_poll_remove_one(struct io_kiocb *req, static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { - int refs; bool do_complete; io_poll_remove_double(req); @@ -5248,8 +5244,9 @@ static bool io_poll_remove_one(struct io_kiocb *req) req_set_fail(req); /* non-poll requests have submit ref still */ - refs = 1 + (req->opcode != IORING_OP_POLL_ADD); - io_put_req_deferred(req, refs); + if (req->opcode != IORING_OP_POLL_ADD) + io_put_req(req); + io_put_req_deferred(req); } return do_complete; } @@ -5550,7 +5547,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); return 0; } From 5d5901a3434064e98c1dbb3047b9f9793825ea42 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:29 +0100 Subject: [PATCH 241/315] io_uring: remove submission references Requests are by default given with two references, submission and completion. Completion references are straightforward, they represent request ownership and are put when a request is completed or so. Submission references are a bit more trickier. They're needed when io_issue_sqe() followed deep into the submission stack (e.g. in fs, block, drivers, etc.), request may have given away for concurrent execution or already completed, and the code unwinding back to io_issue_sqe() may be accessing some pieces of our requests, e.g. file or iov. Now, we prevent such async/in-depth completions by pushing requests through task_work. Punting to io-wq is also done through task_works, apart from a couple of cases with a pretty well known context. So, there're two cases: 1) io_issue_sqe() from the task context and protected by ->uring_lock. Either requests return back to io_uring or handed to task_work, which won't be executed because we're currently controlling that task. So, we can be sure that requests are staying alive all the time and we don't need submission references to pin them. 2) io_issue_sqe() from io-wq, which doesn't hold the mutex. The role of submission reference is played by io-wq reference, which is put by io_wq_submit_work(). Hence, it should be fine. Considering that, we can carefully kill the submission reference. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b68f1c763229a590f2a27148aee77767a8d7750.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9d1c18865ef4..6ad6c56e7f4d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1705,7 +1705,6 @@ static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { req_set_fail(req); - io_put_req(req); io_req_complete_post(req, res, 0); } @@ -1760,7 +1759,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) return nr != 0; } +/* + * A request might get retired back into the request caches even before opcode + * handlers and io_issue_sqe() are done with it, e.g. inline completion path. + * Because of that, io_alloc_req() should be called only under ->uring_lock + * and with extra caution to not get a request that is still worked on. + */ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -1875,8 +1881,6 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - - io_put_req(link); io_put_req_deferred(link); link = nxt; } @@ -2158,8 +2162,6 @@ static void io_submit_flush_completions(struct io_ring_ctx *ctx) for (i = 0; i < nr; i++) { struct io_kiocb *req = state->compl_reqs[i]; - /* submission and completion refs */ - io_put_req(req); if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } @@ -2264,7 +2266,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (READ_ONCE(req->result) == -EAGAIN && resubmit && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - req_ref_get(req); io_req_task_queue_reissue(req); continue; } @@ -2762,7 +2763,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { - req_ref_get(req); io_req_task_queue_reissue(req); } else { int cflags = 0; @@ -3188,9 +3188,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); - - /* submit ref gets dropped, acquire a new one */ - req_ref_get(req); io_req_task_queue(req); return 1; } @@ -5242,10 +5239,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - - /* non-poll requests have submit ref still */ - if (req->opcode != IORING_OP_POLL_ADD) - io_put_req(req); io_put_req_deferred(req); } return do_complete; @@ -6280,6 +6273,9 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; + /* will be dropped by ->io_free_work() after returning to io-wq */ + req_ref_get(req); + timeout = io_prep_linked_timeout(req); if (timeout) io_queue_linked_timeout(timeout); @@ -6302,11 +6298,8 @@ static void io_wq_submit_work(struct io_wq_work *work) } /* avoid locking problems by failing it from a clean context */ - if (ret) { - /* io-wq is going to take one down */ - req_ref_get(req); + if (ret) io_req_task_queue_fail(req, ret); - } } static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, @@ -6448,6 +6441,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + /* linked timeouts should have two refs once prep'ed */ + req_ref_get(nxt); nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; @@ -6468,7 +6463,6 @@ issue_sqe: * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; struct io_submit_state *state = &ctx->submit_state; @@ -6476,8 +6470,6 @@ issue_sqe: state->compl_reqs[state->compl_nr++] = req; if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); - } else { - io_put_req(req); } } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { switch (io_arm_poll_handler(req)) { @@ -6557,8 +6549,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - atomic_set(&req->refs, 2); + atomic_set(&req->refs, 1); req->task = current; /* enforce forwards compatibility on users */ From 20e60a3832089741d6b25c13d291050c5d00b4e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:30 +0100 Subject: [PATCH 242/315] io_uring: skip request refcounting As submission references are gone, there is only one initial reference left. Instead of actually doing atomic refcounting, add a flag indicating whether we're going to take more refs or doing any other sync magic. The flag should be set before the request may get used in parallel. Together with the previous patch it saves 2 refcount atomics per request for IOPOLL and IRQ completions, and 1 atomic per req for inline completions, with some exceptions. In particular, currently, there are three cases, when the refcounting have to be enabled: - Polling, including apoll. Because double poll entries takes a ref. Might get relaxed in the near future. - Link timeouts, enabled for both, the timeout and the request it's bound to, because they work in-parallel and we need to synchronise to cancel one of them on completion. - When a request gets in io-wq, because it doesn't hold uring_lock and we need guarantees of submission references. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b204b6c5f6643062270a1913d6d3a7f8f795fd9.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6ad6c56e7f4d..dae87c576943 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -710,6 +710,7 @@ enum { REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_NOWAIT_READ_BIT, REQ_F_NOWAIT_WRITE_BIT, @@ -765,6 +766,8 @@ enum { REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), }; struct async_poll { @@ -1087,26 +1090,40 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline bool req_ref_inc_not_zero(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); return atomic_inc_not_zero(&req->refs); } static inline bool req_ref_put_and_test(struct io_kiocb *req) { + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); return atomic_dec_and_test(&req->refs); } static inline void req_ref_put(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_put_and_test(req)); } static inline void req_ref_get(struct io_kiocb *req) { + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); atomic_inc(&req->refs); } +static inline void io_req_refcount(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, 1); + } +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -5192,6 +5209,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_req_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); @@ -5382,6 +5400,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + io_req_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6273,6 +6292,7 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; + io_req_refcount(req); /* will be dropped by ->io_free_work() after returning to io-wq */ req_ref_get(req); @@ -6442,7 +6462,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return NULL; /* linked timeouts should have two refs once prep'ed */ + io_req_refcount(req); + io_req_refcount(nxt); req_ref_get(nxt); + nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; @@ -6549,7 +6572,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - atomic_set(&req->refs, 1); req->task = current; /* enforce forwards compatibility on users */ From fd08e5309bba8672c1190362dff6c92bfd59218d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Aug 2021 19:28:31 +0100 Subject: [PATCH 243/315] io_uring: optimise hot path of ltimeout prep io_prep_linked_timeout() grew too heavy and compiler now refuse to inline the function. Help it by splitting in two and annotating with inline. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/560636717a32e9513724f09b9ecaace942dde4d4.1628705069.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dae87c576943..4f5a00707644 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1046,7 +1046,6 @@ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, @@ -1299,6 +1298,31 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + if (req->flags & REQ_F_LINK_TIMEOUT) + return NULL; + + /* linked timeouts should have two refs once prep'ed */ + io_req_refcount(req); + io_req_refcount(nxt); + req_ref_get(nxt); + + nxt->timeout.head = req; + nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; + req->flags |= REQ_F_LINK_TIMEOUT; + return nxt; +} + +static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT)) + return NULL; + return __io_prep_linked_timeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -6453,25 +6477,6 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || - nxt->opcode != IORING_OP_LINK_TIMEOUT) - return NULL; - - /* linked timeouts should have two refs once prep'ed */ - io_req_refcount(req); - io_req_refcount(nxt); - req_ref_get(nxt); - - nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; -} - static void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { From a4aadd11ea4932588e6530ecd021ffe39f9d5adf Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 12:14:34 +0800 Subject: [PATCH 244/315] io_uring: extract io_uring_files_cancel() in io_uring_task_cancel() Extract io_uring_files_cancel() call in io_uring_task_cancel() to make io_uring_files_cancel() and io_uring_task_cancel() coherent and easy to read. Signed-off-by: Hao Xu Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 04b650bcbbe5..ed13304e764c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -17,7 +17,8 @@ static inline void io_uring_files_cancel(struct files_struct *files) } static inline void io_uring_task_cancel(void) { - return io_uring_files_cancel(NULL); + if (current->io_uring) + __io_uring_cancel(NULL); } static inline void io_uring_free(struct task_struct *tsk) { From f552a27afe67f05c47bb0c33b92af2a23b684c31 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 12:14:35 +0800 Subject: [PATCH 245/315] io_uring: remove files pointer in cancellation functions When doing cancellation, we use a parameter to indicate where it's from do_exit or exec. So a boolean value is good enough for this, remove the struct files* as it is not necessary. Signed-off-by: Hao Xu [axboe: fixup io_uring_files_cancel for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- include/linux/io_uring.h | 10 +++++----- kernel/exit.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4f5a00707644..7626cad93f60 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9213,9 +9213,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index ed13304e764c..649a4d7c241b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,18 +7,18 @@ #if defined(CONFIG_IO_URING) struct sock *io_uring_get_socket(struct file *file); -void __io_uring_cancel(struct files_struct *files); +void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { if (current->io_uring) - __io_uring_cancel(files); + __io_uring_cancel(false); } static inline void io_uring_task_cancel(void) { if (current->io_uring) - __io_uring_cancel(NULL); + __io_uring_cancel(true); } static inline void io_uring_free(struct task_struct *tsk) { @@ -33,7 +33,7 @@ static inline struct sock *io_uring_get_socket(struct file *file) static inline void io_uring_task_cancel(void) { } -static inline void io_uring_files_cancel(struct files_struct *files) +static inline void io_uring_files_cancel(void) { } static inline void io_uring_free(struct task_struct *tsk) diff --git a/kernel/exit.c b/kernel/exit.c index 9a89e7f36acb..91a43e57a32e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -777,7 +777,7 @@ void __noreturn do_exit(long code) schedule(); } - io_uring_files_cancel(tsk->files); + io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ From 41a5169c23ebe85fdd0b64a0b6381f486a34ef3c Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 12 Aug 2021 15:47:02 +0800 Subject: [PATCH 246/315] io_uring: code clean for completion_lock in io_arm_poll_handler() We can merge two spin_unlock() operations to one since we removed some code not long ago. Signed-off-by: Hao Xu Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7626cad93f60..c581c7cbb8b2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5237,13 +5237,10 @@ static int io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret || ipt.error) { - spin_unlock(&ctx->completion_lock); - if (ret) - return IO_APOLL_READY; - return IO_APOLL_ABORTED; - } spin_unlock(&ctx->completion_lock); + if (ret || ipt.error) + return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; From a141dd896f544df9627502cfb3fc1a73fb6587e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Aug 2021 12:48:34 -0600 Subject: [PATCH 247/315] io_uring: correct __must_hold annotation io_req_free_batch() has a __must_hold annotation referencing a request being passed in, but we're passing in the context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c581c7cbb8b2..e1ca427e183a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2182,7 +2182,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, } static void io_submit_flush_completions(struct io_ring_ctx *ctx) - __must_hold(&req->ctx->uring_lock) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; int i, nr = state->compl_nr; From 48dcd38d73c22b22bf9dc1c01b0ca0b8414b31da Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:18 +0100 Subject: [PATCH 248/315] io_uring: optimise iowq refcounting If a requests is forwarded into io-wq, there is a good chance it hasn't been refcounted yet and we can save one req_ref_get() by setting the refcount number to the right value directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2d53f4449faaf73b4a4c5de667fc3c176d974860.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e1ca427e183a..86466e12c74d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1115,14 +1115,19 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } -static inline void io_req_refcount(struct io_kiocb *req) +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, 1); + atomic_set(&req->refs, nr); } } +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1306,8 +1311,8 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return NULL; /* linked timeouts should have two refs once prep'ed */ - io_req_refcount(req); - io_req_refcount(nxt); + io_req_set_refcount(req); + io_req_set_refcount(nxt); req_ref_get(nxt); nxt->timeout.head = req; @@ -5233,7 +5238,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_req_refcount(req); + io_req_set_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); @@ -5421,7 +5426,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; - io_req_refcount(req); + io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -6313,9 +6318,11 @@ static void io_wq_submit_work(struct io_wq_work *work) struct io_kiocb *timeout; int ret = 0; - io_req_refcount(req); - /* will be dropped by ->io_free_work() after returning to io-wq */ - req_ref_get(req); + /* one will be dropped by ->io_free_work() after returning to io-wq */ + if (!(req->flags & REQ_F_REFCOUNT)) + __io_req_set_refcount(req, 2); + else + req_ref_get(req); timeout = io_prep_linked_timeout(req); if (timeout) From 761bcac1573efc99042d59add94d468bf17127f0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:19 +0100 Subject: [PATCH 249/315] io_uring: don't inflight-track linked timeouts Tracking linked timeouts as infligh was needed to make sure that io-wq is not destroyed by io_uring_cancel_generic() racing with io_async_cancel_one() accessing it. Now, cancellations issued by linked timeouts are done in the task context, so it's already synchronised. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1b05cf47cb69df2305efdbee8cf7ba36f46c1a3.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 86466e12c74d..5b95ab661f31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5701,8 +5701,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - if (is_timeout_link) - io_req_track_inflight(req); return 0; } From fb6820998f57a3e63a382a322530fa28522a2bba Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:20 +0100 Subject: [PATCH 250/315] io_uring: optimise initial ltimeout refcounting Linked timeouts are never refcounted when it comes to the first call to __io_prep_linked_timeout(), so save an io_ref_get() and set the desired value directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/177b24cc62ffbb42d915d6eb9e8876266e4c0d5a.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5b95ab661f31..005fc06f89b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1312,8 +1312,7 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); - io_req_set_refcount(nxt); - req_ref_get(nxt); + __io_req_set_refcount(nxt, 2); nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; From a8576af9d1b03a1b8aba7228e938ab0817fdbda6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:21 +0100 Subject: [PATCH 251/315] io_uring: kill not necessary resubmit switch 773af69121ecc ("io_uring: always reissue from task_work context") makes all resubmission to be made from task_work, so we don't need that hack with resubmit/not-resubmit switch anymore. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/47fa177cca04e5ffd308a35227966c8e15d8525b.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 005fc06f89b9..d6e0e7e317da 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2293,7 +2293,7 @@ static inline bool io_run_task_work(void) * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done, bool resubmit) + struct list_head *done) { struct req_batch rb; struct io_kiocb *req; @@ -2308,7 +2308,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN && resubmit && + if (READ_ONCE(req->result) == -EAGAIN && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; io_req_task_queue_reissue(req); @@ -2331,7 +2331,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min, bool resubmit) + long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); @@ -2371,7 +2371,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done, resubmit); + io_iopoll_complete(ctx, nr_events, &done); return 0; } @@ -2389,7 +2389,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 0, false); + io_do_iopoll(ctx, &nr_events, 0); /* let it sleep and repeat later if can't complete a request */ if (nr_events == 0) @@ -2451,7 +2451,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min, true); + ret = io_do_iopoll(ctx, &nr_events, min); } while (!ret && nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); @@ -6857,7 +6857,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0, true); + io_do_iopoll(ctx, &nr_events, 0); /* * Don't submit if refs are dying, good for io_uring_register(), From 8cb01fac982a3f8622a46821af1eb68136f936ca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:22 +0100 Subject: [PATCH 252/315] io_uring: deduplicate cancellation code IORING_OP_ASYNC_CANCEL and IORING_OP_LINK_TIMEOUT have enough of overlap, so extract a helper for request cancellation and use in both. Also, removes some amount of ugliness because of success_ret. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/900122b588e65b637e71bfec80a260726c6a54d6.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d6e0e7e317da..a23b6089cbf6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5792,32 +5792,24 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static void io_async_find_and_cancel(struct io_ring_ctx *ctx, - struct io_kiocb *req, __u64 sqe_addr, - int success_ret) +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) + __acquires(&req->ctx->completion_lock) { + struct io_ring_ctx *ctx = req->ctx; int ret; + WARN_ON_ONCE(req->task != current); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); spin_lock(&ctx->completion_lock); if (ret != -ENOENT) - goto done; + return ret; spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); -done: - if (!ret) - ret = success_ret; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail(req); + return ret; + return io_poll_cancel(ctx, sqe_addr, false); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5841,17 +5833,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) struct io_tctx_node *node; int ret; - /* tasks should wait for their io-wq threads, so safe w/o sync */ - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); - if (ret != -ENOENT) - goto done; - spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_cancel(ctx, sqe_addr); - spin_unlock_irq(&ctx->timeout_lock); - if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; spin_unlock(&ctx->completion_lock); @@ -6418,9 +6400,17 @@ static void io_req_task_link_timeout(struct io_kiocb *req) { struct io_kiocb *prev = req->timeout.prev; struct io_ring_ctx *ctx = req->ctx; + int ret; if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); + ret = io_try_cancel_userdata(req, prev->user_data); + if (!ret) + ret = -ETIME; + io_cqring_fill_event(ctx, req->user_data, ret, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + io_put_req(prev); io_put_req(req); } else { From b97e736a4b553ff18963019c7ca91cd684f83709 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:23 +0100 Subject: [PATCH 253/315] io_uring: kill REQ_F_LTIMEOUT_ACTIVE Instead of handling double consecutive linked timeouts through tricky flag combinations, just check the submit_state.link during timeout_prep and fail that case in advance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/04150760b0dc739522264b8abd309409f7421a06.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a23b6089cbf6..abf8bb781d40 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -705,7 +705,6 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, @@ -750,8 +749,6 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* linked timeout is active, i.e. prepared by link's head */ - REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -1315,7 +1312,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) __io_req_set_refcount(nxt, 2); nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } @@ -1895,11 +1891,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) { struct io_kiocb *link = req->link; - /* - * Can happen if a linked timeout fired and link had been like - * req -> link t-out -> link t-out [-> ...] - */ - if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; io_remove_next_linked(req); @@ -5700,6 +5692,15 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + } return 0; } From 4d13d1a4d1e1807e04b846b48934e87016027f90 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:24 +0100 Subject: [PATCH 254/315] io_uring: simplify io_prep_linked_timeout The link test in io_prep_linked_timeout() is pretty bulky, replace it with a flag. It's better for normal path and linked requests, and also will be used further for request failing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3703770bfae8bc1ff370e43ef5767940202cab42.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index abf8bb781d40..097ba596c819 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -710,6 +710,7 @@ enum { REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, /* keep async read/write and isreg together and in order */ REQ_F_NOWAIT_READ_BIT, REQ_F_NOWAIT_WRITE_BIT, @@ -765,6 +766,8 @@ enum { REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), /* skip refcounting if not set */ REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), }; struct async_poll { @@ -1302,23 +1305,18 @@ static void io_req_track_inflight(struct io_kiocb *req) static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt = req->link; - - if (req->flags & REQ_F_LINK_TIMEOUT) - return NULL; + req->flags &= ~REQ_F_ARM_LTIMEOUT; + req->flags |= REQ_F_LINK_TIMEOUT; /* linked timeouts should have two refs once prep'ed */ io_req_set_refcount(req); - __io_req_set_refcount(nxt, 2); - - nxt->timeout.head = req; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; + __io_req_set_refcount(req->link, 2); + return req->link; } static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT)) + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) return NULL; return __io_prep_linked_timeout(req); } @@ -5700,6 +5698,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; + req->timeout.head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; } From 0756a8691017518ceeca4c083e7a359107186498 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:25 +0100 Subject: [PATCH 255/315] io_uring: cancel not-armed linked touts separately Adjust io_disarm_next(), so it can detect if there is a linked but not-yet-armed timeout and complete/cancel it separately. Will be used in the following patch. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ae228cde2c0df3d92d29d5e4852ed9fa8a2a97db.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 097ba596c819..8eea3a1e8c21 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1034,6 +1034,9 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_UNLINKAT] = {}, }; +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) + static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, @@ -1688,7 +1691,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, */ if (req_ref_put_and_test(req)) { if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) + if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1926,7 +1929,17 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) { + if (req->flags & REQ_F_ARM_LTIMEOUT) { + struct io_kiocb *link = req->link; + + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); + io_put_req_deferred(link); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -1951,7 +1964,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { + if (req->flags & IO_DISARM_MASK) { struct io_ring_ctx *ctx = req->ctx; bool posted; From 906c6caaf586180261ea581915e1cf8bc466bd69 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Aug 2021 10:40:26 +0100 Subject: [PATCH 256/315] io_uring: optimise io_prep_linked_timeout() Linked timeout handling during issuing is heavy, it adds extra instructions and forces to save the next linked timeout before io_issue_sqe(). Follwing the same reasoning as in refcounting patches, a request can't be freed by the time it returns from io_issue_sqe(), so now we don't need to do io_prep_linked_timeout() in advance, and it can be delayed to colder paths optimising the generic path. Also, it should also save quite a lot for requests with linked timeouts and completed inline on timeout spinlocking + hrtimer_start() + hrtimer_try_to_cancel() and so on. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19bfc9a0d26c5c5f1e359f7650afe807ca8ef879.1628981736.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8eea3a1e8c21..d78a05ecbf68 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1306,8 +1306,16 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static inline void io_unprep_linked_timeout(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_TIMEOUT; +} + static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { + if (WARN_ON_ONCE(!req->link)) + return NULL; + req->flags &= ~REQ_F_ARM_LTIMEOUT; req->flags |= REQ_F_LINK_TIMEOUT; @@ -1932,6 +1940,7 @@ static bool io_disarm_next(struct io_kiocb *req) if (req->flags & REQ_F_ARM_LTIMEOUT) { struct io_kiocb *link = req->link; + req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); io_cqring_fill_event(link->ctx, link->user_data, @@ -6485,7 +6494,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; int ret; issue_sqe: @@ -6503,10 +6512,19 @@ issue_sqe: state->compl_reqs[state->compl_nr++] = req; if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); + return; } + + linked_timeout = io_prep_linked_timeout(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + linked_timeout = io_prep_linked_timeout(req); + switch (io_arm_poll_handler(req)) { case IO_APOLL_READY: + if (linked_timeout) + io_unprep_linked_timeout(req); goto issue_sqe; case IO_APOLL_ABORTED: /* @@ -6516,11 +6534,12 @@ issue_sqe: io_queue_async_work(req); break; } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else { io_req_complete_failed(req, ret); } - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) From ae421d9350b51cba1daa28ee6eb14fbce7517eca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:08 +0100 Subject: [PATCH 257/315] io_uring: better encapsulate buffer select for rw Make io_put_rw_kbuf() to do the REQ_F_BUFFER_SELECTED check, so all the callers don't need to hand code it. The number of places where we call io_put_rw_kbuf() is growing, so saves some pain. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3df3919e5e7efe03420c44ab4d9317a81a9cf398.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d78a05ecbf68..4b6003bc731a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2286,6 +2286,8 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; return io_put_kbuf(req, kbuf); } @@ -2315,8 +2317,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_init_req_batch(&rb); while (!list_empty(done)) { - int cflags = 0; - req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); @@ -2327,10 +2327,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, continue; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - - __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -2550,11 +2548,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req) { - int cflags = 0; - - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, 0, req->result, cflags); + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, @@ -2822,12 +2816,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { - int cflags = 0; - req_set_fail(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, + io_put_rw_kbuf(req)); } } } From 505657bc6c52b01304d8a7c79b2f98878e3d83db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:09 +0100 Subject: [PATCH 258/315] io_uring: reuse io_req_complete_post() We have io_req_complete_post() to post a CQE and put the request. It takes care of all synchronisation and is more concise and efficent, so replace all hancoded occurrences of "lock; post CQE; unlock; + put_req()" with io_req_complete_post(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2c83463458a613f9d870e5147eb134da2aa70779.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 +++++++++++------------------------------------- 1 file changed, 11 insertions(+), 37 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4b6003bc731a..6f2668b6b867 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5523,16 +5523,8 @@ err: static void io_req_task_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, -ETIME, 0); } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -5660,14 +5652,9 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) io_translate_timeout_mode(tr->flags)); spin_unlock_irq(&ctx->timeout_lock); - spin_lock(&ctx->completion_lock); - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5807,7 +5794,6 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, } static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) - __acquires(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5815,15 +5801,19 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) WARN_ON_ONCE(req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock(&ctx->completion_lock); if (ret != -ENOENT) return ret; + + spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - return ret; - return io_poll_cancel(ctx, sqe_addr, false); + goto out; + ret = io_poll_cancel(ctx, sqe_addr, false); +out: + spin_unlock(&ctx->completion_lock); + return ret; } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5850,7 +5840,6 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; - spin_unlock(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5863,17 +5852,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) break; } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - - spin_lock(&ctx->completion_lock); done: - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -6413,20 +6395,12 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, static void io_req_task_link_timeout(struct io_kiocb *req) { struct io_kiocb *prev = req->timeout.prev; - struct io_ring_ctx *ctx = req->ctx; int ret; if (prev) { ret = io_try_cancel_userdata(req, prev->user_data); - if (!ret) - ret = -ETIME; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - + io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); - io_put_req(req); } else { io_req_complete_post(req, -ETIME, 0); } From 23a65db83b3f4549e5eee1fb5517c3365f627699 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 17 Aug 2021 20:28:11 +0100 Subject: [PATCH 259/315] io_uring: improve same wq polling Move earlier the check for whether __io_queue_proc() tries to poll already polled waitqueue, and do the same for the second poll entry, if any. Shouldn't really matter, but at least it would have a more predictable behaviour. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8cb428cfe8ade0fd055859fabb878db8777d4c2f.1629228203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f2668b6b867..71d54841ecbe 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5062,8 +5062,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, if (unlikely(pt->nr_entries)) { struct io_poll_iocb *poll_one = poll; + /* double add on the same waitqueue head, ignore */ + if (poll_one->head == head) + return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; pt->error = -EINVAL; return; } @@ -5073,9 +5078,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (!(poll_one->events & EPOLLONESHOT)) poll_one->events |= EPOLLONESHOT; - /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) - return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; From ec3c3d0f3a271b5c7422449262970e7eb98f2126 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 10:50:52 +0100 Subject: [PATCH 260/315] io_uring: fix io_timeout_remove locking io_timeout_cancel() posts CQEs so needs ->completion_lock to be held, so grab it in io_timeout_remove(). Fixes: 48ecb6369f1f2 ("io_uring: run timeouts from task_work") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d6f03d653a4d7bf693ef6f39b6a426b6d97fd96f.1629280204.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 71d54841ecbe..08546332f1b9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5572,6 +5572,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, } static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) + __must_hold(&ctx->completion_lock) __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5646,13 +5647,18 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->timeout_lock); - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - else + spin_unlock_irq(&ctx->timeout_lock); + spin_unlock(&ctx->completion_lock); + } else { + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_update(ctx, tr->addr, &tr->ts, io_translate_timeout_mode(tr->flags)); - spin_unlock_irq(&ctx->timeout_lock); + spin_unlock_irq(&ctx->timeout_lock); + } if (ret < 0) req_set_fail(req); From 79dca1846fe979304ad0b998e56b20326e2e5a72 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 10 Aug 2021 20:55:54 +0800 Subject: [PATCH 261/315] io-wq: move nr_running and worker_refs out of wqe->lock protection We don't need to protect nr_running and worker_refs by wqe->lock, so narrow the range of raw_spin_lock_irq - raw_spin_unlock_irq Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210810125554.99229-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 4ce83bb48021..8da9bb103916 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -256,16 +256,17 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } raw_spin_unlock_irq(&wqe->lock); - if (do_create) + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); create_io_worker(wqe->wq, wqe, acct->index, first); + } } } From 316319e82f7342ef327223a23199648bfabeadcd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Aug 2021 09:41:42 -0600 Subject: [PATCH 262/315] io_uring: add comments on why PF_EXITING checking is safe We have two checks of task->flags & PF_EXITING left: 1) In io_req_task_submit(), which is called in task_work and hence always in the context of the original task. That means that req->task == current, and hence checking ->flags is totally fine. 2) In io_poll_rewait(), where we need to stop re-arming poll to prevent it interfering with cancelation. This is only run from task_work as well, and hence for this case too req->task == current. Add a comment to both spots detailing that. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 08546332f1b9..78aa4daf7969 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2112,6 +2112,7 @@ static void io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); + /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else @@ -4893,6 +4894,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) WRITE_ONCE(poll->canceled, true); From e98e49b2bbf777f91732dc916d7ad33876c663c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 17:01:43 +0100 Subject: [PATCH 263/315] io_uring: extend task put optimisations Now with IRQ completions done via IRQ, almost all requests freeing are done from the context of submitter task, so it makes sense to extend task_put optimisation from io_req_free_batch_finish() to cover all the cases including task_work by moving it into io_put_task(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/824a7cbd745ddeee4a0f3ff85c558a24fd005872.1629302453.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 78aa4daf7969..9107cd78863d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1623,10 +1623,14 @@ static inline void io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); + if (likely(task == current)) { + tctx->cached_refs += nr; + } else { + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); + } } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, @@ -2171,9 +2175,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, { if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task == current) - current->io_uring->cached_refs += rb->task_refs; - else if (rb->task) + if (rb->task) io_put_task(rb->task, rb->task_refs); } From 187f08c12cd1d81f000cdc9c0119ef6e0a6f47e3 Mon Sep 17 00:00:00 2001 From: wangyangbo Date: Thu, 19 Aug 2021 13:56:57 +0800 Subject: [PATCH 264/315] io_uring: Add register support for non-4k PAGE_SIZE Now allocated rsrc table uses PAGE_SIZE as the size of 2nd-level, and accessing this table relies on each level index from fixed TABLE_SHIFT (12 - 3) in 4k page case. In order to correctly work in non-4k page, define TABLE_SHIFT as non-fixed (PAGE_SHIFT - shift of data) for 2nd-level table entry number. Signed-off-by: wangyangbo Link: https://lore.kernel.org/r/20210819055657.27327-1-wangyangbo@uniontech.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9107cd78863d..5d3df4f913a8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -92,12 +92,12 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* 512 entries per page on 64-bit archs, 64 pages max */ +/* only define max */ #define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) From 99c8bc52d1321ab3a711eba2941eadbe7425230f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sat, 21 Aug 2021 06:19:54 +0800 Subject: [PATCH 265/315] io_uring: fix lack of protection for compl_nr coml_nr in ctx_flush_and_put() is not protected by uring_lock, this may cause problems when accessing in parallel: say coml_nr > 0 ctx_flush_and put other context if (compl_nr) get mutex coml_nr > 0 do flush coml_nr = 0 release mutex get mutex do flush (*) release mutex in (*) place, we call io_cqring_ev_posted() and users likely get no events there. To avoid spurious events, re-check the value when under the lock. Fixes: 2c32395d8111 ("io_uring: fix __tctx_task_work() ctx race") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210820221954.61815-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5d3df4f913a8..706ac8c03b95 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2007,7 +2007,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) return; if (ctx->submit_state.compl_nr) { mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(ctx); + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); From 3a1b8a4e843f96b636431450d8d79061605cf74b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:35 +0100 Subject: [PATCH 266/315] io_uring: limit fixed table size by RLIMIT_NOFILE Limit the number of files in io_uring fixed tables by RLIMIT_NOFILE, that's the first and the simpliest restriction that we should impose. Cc: stable@vger.kernel.org Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b2756c340aed7d6c0b302c26dab50c6c5907f4ce.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 706ac8c03b95..9a021fe6c461 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7733,6 +7733,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; From 0bea96f59ba40e63c0ae93ad6a02417b95f22f4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:36 +0100 Subject: [PATCH 267/315] io_uring: place fixed tables under memcg limits Fixed tables may be large enough, place all of them together with allocated tags under memcg limits. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b3ac9f5da9821bb59837b5fe25e8ef4be982218c.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 9a021fe6c461..c6f1afa5ec04 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7138,14 +7138,14 @@ static void **io_alloc_page_table(size_t size) size_t init_size = size; void **table; - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - table[i] = kzalloc(this_size, GFP_KERNEL); + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; @@ -7336,7 +7336,8 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); return !!table->files; } From 2c5d763c1939fbd130452ee0d4d1a44b5dd97bb7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 21 Aug 2021 07:21:19 -0600 Subject: [PATCH 268/315] io_uring: add clarifying comment for io_cqring_ev_posted() We've previously had an issue where overflow flush unconditionally calls io_cqring_ev_posted() even if it didn't flush any events to the ring, causing wake and eventfd increment where no new events are available. Some applications don't like that, see commit b18032bb0a88 for details. This came up in discussion for another patch recently, hence add a comment detailing what the relationship between calling the events posted helper and CQ ring entries is. Link: https://lore.kernel.org/io-uring/77a44fce-c831-16a6-8e80-9aee77f496a2@kernel.dk/ Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index c6f1afa5ec04..8d263209b508 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1526,6 +1526,13 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return !ctx->eventfd_async || io_wq_current_is_worker(); } +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { /* From 26578cda3db983b17cabe4e577af26306beb9987 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Aug 2021 10:36:37 +0100 Subject: [PATCH 269/315] io_uring: add ->splice_fd_in checks ->splice_fd_in is used only by splice/tee, but no other request checks it for validity. Add the check for most of request types excluding reads/writes/sends/recvs, we don't want overhead for them and can leave them be as is until the field is actually used. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f44bc2acd6777d932de3d71a5692235b5b2b7397.1629451684.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 +++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8d263209b508..3898f7ab14f6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3511,7 +3511,7 @@ static int io_renameat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3562,7 +3562,8 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3608,8 +3609,8 @@ static int io_shutdown_prep(struct io_kiocb *req, #if defined(CONFIG_NET) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->shutdown.how = READ_ONCE(sqe->len); @@ -3757,7 +3758,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -3790,7 +3792,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3823,7 +3826,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3942,7 +3945,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4013,7 +4017,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4100,7 +4104,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4146,7 +4150,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4181,7 +4185,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr) + if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4219,7 +4223,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4255,7 +4259,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4316,7 +4320,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4743,7 +4748,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4791,7 +4796,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -5387,7 +5393,7 @@ static int io_poll_update_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -5626,7 +5632,7 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; tr->addr = READ_ONCE(sqe->addr); @@ -5687,7 +5693,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || + sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; @@ -5843,7 +5850,8 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || + sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -5884,7 +5892,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); From 5636c00d3e8ef1f6d1291e71edb48f727ba5a999 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:45 +0100 Subject: [PATCH 270/315] io_uring: flush completions for fallbacks io_fallback_req_func() doesn't expect anyone creating inline completions, and no one currently does that. Teach the function to flush completions preparing for further changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b941516921f72e1a64d58932d671736892d7fff.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3898f7ab14f6..926df5b00b31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1197,6 +1197,11 @@ static void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) req->io_task_work.func(req); + + mutex_lock(&ctx->uring_lock); + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } From f237c30a5610d35a584f3296d397b93d80ce374e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:46 +0100 Subject: [PATCH 271/315] io_uring: batch task work locking Many task_work handlers either grab ->uring_lock, or may benefit from having it. Move locking logic out of individual handlers to a lazy approach controlled by tctx_task_work(), so we don't keep doing tons of mutex lock/unlock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d6a34e147f2507a2f3e2fa1e38a9c541dcad3929.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 80 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 926df5b00b31..cb98a831586a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -775,7 +775,7 @@ struct async_poll { struct io_poll_iocb *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { @@ -1080,6 +1080,14 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) @@ -1193,16 +1201,19 @@ static void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; + bool locked = false; percpu_ref_get(&ctx->refs); llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); - mutex_lock(&ctx->uring_lock); - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); + if (locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } percpu_ref_put(&ctx->refs); + } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -1386,12 +1397,15 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + /* must not take the lock, NULL it as a precaution */ + locked = NULL; + BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -2013,21 +2027,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static void ctx_flush_and_put(struct io_ring_ctx *ctx) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; - if (ctx->submit_state.compl_nr) { - mutex_lock(&ctx->uring_lock); + if (*locked) { if (ctx->submit_state.compl_nr) io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); + *locked = false; } percpu_ref_put(&ctx->refs); } static void tctx_task_work(struct callback_head *cb) { + bool locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -2050,18 +2065,18 @@ static void tctx_task_work(struct callback_head *cb) io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); ctx = req->ctx; percpu_ref_get(&ctx->refs); } - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); node = next; } while (node); cond_resched(); } - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); } static void io_req_task_work_add(struct io_kiocb *req) @@ -2113,28 +2128,26 @@ static void io_req_task_work_add(struct io_kiocb *req) } } -static void io_req_task_cancel(struct io_kiocb *req) +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; /* ctx is guaranteed to stay alive while we hold uring_lock */ - mutex_lock(&ctx->uring_lock); + io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); - mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ - mutex_lock(&ctx->uring_lock); + io_tw_lock(ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); - mutex_unlock(&ctx->uring_lock); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -2170,6 +2183,11 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +static void io_free_req_work(struct io_kiocb *req, bool *locked) +{ + io_free_req(req); +} + struct req_batch { struct task_struct *task; int task_refs; @@ -2267,7 +2285,7 @@ static inline void io_put_req(struct io_kiocb *req) static inline void io_put_req_deferred(struct io_kiocb *req) { if (req_ref_put_and_test(req)) { - req->io_task_work.func = io_free_req; + req->io_task_work.func = io_free_req_work; io_req_task_work_add(req); } } @@ -2562,7 +2580,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void io_req_task_complete(struct io_kiocb *req) +static void io_req_task_complete(struct io_kiocb *req, bool *locked) { __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } @@ -2572,7 +2590,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { if (__io_complete_rw_common(req, res)) return; - io_req_task_complete(req); + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -4994,7 +5012,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct io_kiocb *req) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; @@ -5018,7 +5036,7 @@ static void io_poll_task_func(struct io_kiocb *req) if (done) { nxt = io_put_req_find_next(req); if (nxt) - io_req_task_submit(nxt); + io_req_task_submit(nxt, locked); } } } @@ -5130,7 +5148,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct io_kiocb *req) +static void io_async_task_func(struct io_kiocb *req, bool *locked) { struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5147,7 +5165,7 @@ static void io_async_task_func(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req); + io_req_task_submit(req, locked); else io_req_complete_failed(req, -ECANCELED); } @@ -5546,7 +5564,7 @@ err: return 0; } -static void io_req_task_timeout(struct io_kiocb *req) +static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { req_set_fail(req); io_req_complete_post(req, -ETIME, 0); @@ -6103,7 +6121,7 @@ fail: if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req); + io_queue_async_work(req, NULL); return true; } @@ -6425,7 +6443,7 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, return io_file_get_normal(ctx, req, fd); } -static void io_req_task_link_timeout(struct io_kiocb *req) +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; int ret; @@ -6529,7 +6547,7 @@ issue_sqe: * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req); + io_queue_async_work(req, NULL); break; } @@ -6554,7 +6572,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req); + io_queue_async_work(req, NULL); } } From 126180b95f27ef6cc536da57115e06665254b0d7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Aug 2021 12:42:47 +0100 Subject: [PATCH 272/315] io_uring: IRQ rw completion batching Employ inline completion logic for read/write completions done via io_req_task_complete(). If ->uring_lock is contended, just do normal request completion, but if not, make tctx_task_work() to grab the lock and do batched inline completions in io_req_task_complete(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94589c3ce69eaed86a21bb1ec696407a54fab1aa.1629286357.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index cb98a831586a..827e60ae4909 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2067,6 +2067,8 @@ static void tctx_task_work(struct callback_head *cb) if (req->ctx != ctx) { ctx_flush_and_put(ctx, &locked); ctx = req->ctx; + /* if not contended, grab and improve batching */ + locked = mutex_trylock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } req->io_task_work.func(req, &locked); @@ -2582,7 +2584,20 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { - __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); + unsigned int cflags = io_put_rw_kbuf(req); + long res = req->result; + + if (*locked) { + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + + io_req_complete_state(req, res, cflags); + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) + io_submit_flush_completions(ctx); + } else { + io_req_complete_post(req, res, cflags); + } } static void __io_complete_rw(struct io_kiocb *req, long res, long res2, From dadebc350da2bef62593b1df007a6e0b90baf42a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Aug 2021 13:30:44 +0100 Subject: [PATCH 273/315] io_uring: fix io_try_cancel_userdata race for iowq WARNING: CPU: 1 PID: 5870 at fs/io_uring.c:5975 io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975 CPU: 0 PID: 5870 Comm: iou-wrk-5860 Not tainted 5.14.0-rc6-next-20210820-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:io_try_cancel_userdata+0x30f/0x540 fs/io_uring.c:5975 Call Trace: io_async_cancel fs/io_uring.c:6014 [inline] io_issue_sqe+0x22d5/0x65a0 fs/io_uring.c:6407 io_wq_submit_work+0x1dc/0x300 fs/io_uring.c:6511 io_worker_handle_work+0xa45/0x1840 fs/io-wq.c:533 io_wqe_worker+0x2cc/0xbb0 fs/io-wq.c:582 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 io_try_cancel_userdata() can be called from io_async_cancel() executing in the io-wq context, so the warning fires, which is there to alert anyone accessing task->io_uring->io_wq in a racy way. However, io_wq_put_and_exit() always first waits for all threads to complete, so the only detail left is to zero tctx->io_wq after the context is removed. note: one little assumption is that when IO_WQ_WORK_CANCEL, the executor won't touch ->io_wq, because io_wq_destroy() might cancel left pending requests in such a way. Cc: stable@vger.kernel.org Reported-by: syzbot+b0c9d1588ae92866515f@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dfdd37a80cfa9ffd3e59538929c99cdd55d8699e.1629721757.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 827e60ae4909..6859438c4e09 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5863,7 +5863,7 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) struct io_ring_ctx *ctx = req->ctx; int ret; - WARN_ON_ONCE(req->task != current); + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); if (ret != -ENOENT) @@ -6369,6 +6369,7 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -9184,8 +9185,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) * Must be after io_uring_del_task_file() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ - tctx->io_wq = NULL; io_wq_put_and_exit(wq); + tctx->io_wq = NULL; } } From 539711d7d6fe382a73254cc966602e63242a6fb3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:15 +0200 Subject: [PATCH 274/315] block: remove a pointless call to MINOR() in device_add_disk blk_alloc_ext_minor already returns just a minor number, so no need to mask the high bits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a925f773145f..f13b7eb0238b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -457,7 +457,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk, if (ret < 0) return ret; disk->major = BLOCK_EXT_MAJOR; - disk->first_minor = MINOR(ret); + disk->first_minor = ret; disk->flags |= GENHD_FL_EXT_DEVT; } From c4b2b7d150d2b155b317b3e2f66492c6befab2b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 09:52:16 +0200 Subject: [PATCH 275/315] block: remove CONFIG_DEBUG_BLOCK_EXT_DEVT This might have been a neat debug aid when the extended dev_t was added, but that time is long gone. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824075216.1179406-3-hch@lst.de Signed-off-by: Jens Axboe --- arch/riscv/configs/defconfig | 1 - arch/riscv/configs/rv32_defconfig | 1 - block/genhd.c | 43 +++---------------------------- init/do_mounts.c | 4 --- lib/Kconfig.debug | 27 ------------------- 5 files changed, 4 insertions(+), 72 deletions(-) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 1f2be234b11c..bc68231a8fb7 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -132,7 +132,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 8dd02b842fef..434ef5b64599 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -127,7 +127,6 @@ CONFIG_DEBUG_PLIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_TRACE is not set CONFIG_RCU_EQS_DEBUG=y -CONFIG_DEBUG_BLOCK_EXT_DEVT=y # CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/block/genhd.c b/block/genhd.c index f13b7eb0238b..6a5b65c86c4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -313,54 +313,19 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. - */ -static int blk_mangle_minor(int minor) -{ -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; - - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; -} - int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) { - if (idx == -ENOSPC) - return -EBUSY; - return idx; - } - return blk_mangle_minor(idx); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } void blk_free_ext_minor(unsigned int minor) { - ida_free(&ext_devt_ida, blk_mangle_minor(minor)); + ida_free(&ext_devt_ida, minor); } static char *bdevt_str(dev_t devt, char *buf) diff --git a/init/do_mounts.c b/init/do_mounts.c index 74aede860de7..b691d6891e51 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -432,10 +432,6 @@ retry: printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " - "explicit textual name for \"root=\" boot option.\n"); -#endif panic("VFS: Unable to mount root fs on %s", b); } if (!(flags & SB_RDONLY)) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5ddd575159fb..22a4aa51bd58 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1679,33 +1679,6 @@ config DEBUG_WQ_FORCE_RR_CPU feature by default. When enabled, memory and cache locality will be impacted. -config DEBUG_BLOCK_EXT_DEVT - bool "Force extended block device numbers and spread them" - depends on DEBUG_KERNEL - depends on BLOCK - default n - help - BIG FAT WARNING: ENABLING THIS OPTION MIGHT BREAK BOOTING ON - SOME DISTRIBUTIONS. DO NOT ENABLE THIS UNLESS YOU KNOW WHAT - YOU ARE DOING. Distros, please enable this and fix whatever - is broken. - - Conventionally, block device numbers are allocated from - predetermined contiguous area. However, extended block area - may introduce non-contiguous block device numbers. This - option forces most block device numbers to be allocated from - the extended space and spreads them to discover kernel or - userland code paths which assume predetermined contiguous - device number allocation. - - Note that turning on this debug option shuffles all the - device numbers for all IDE and SCSI devices including libata - ones, so root partition specified using device number - directly (via rdev or root=MAJ:MIN) won't work anymore. - Textual device names (root=/dev/sdXn) will continue to work. - - Say N if you are unsure. - config CPU_HOTPLUG_STATE_CONTROL bool "Enable CPU hotplug state control" depends on DEBUG_KERNEL From d9cf3bd531844ffbfe94b16e417037a16efc988d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 19 Jul 2021 11:53:00 +0100 Subject: [PATCH 276/315] bio: fix page leak bio_add_hw_page failure __bio_iov_append_get_pages() doesn't put not appended pages on bio_add_hw_page() failure, so potentially leaking them, fix it. Also, do the same for __bio_iov_iter_get_pages(), even though it looks like it can't be triggered by userspace in this case. Fixes: 0512a75b98f8 ("block: Introduce REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org # 5.8+ Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1edfa6a2ffd66d55e6345a477df5387d2c1415d0.1626653825.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 0c89fa2f7a85..265bff6b549a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -974,6 +974,14 @@ static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) return 0; } +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1018,8 +1026,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (same_page) put_page(page); } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } __bio_add_page(bio, page, len, offset); } offset = 0; @@ -1064,6 +1074,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_add_hw_page(q, bio, page, len, offset, max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); ret = -EINVAL; break; } From 0bdfbca8a623e262e0f343b143151000a300cbaf Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:33 +0300 Subject: [PATCH 277/315] block: Add alternative_gpt_sector() operation Add alternative_gpt_sector() block device operation which specifies alternative location of a GPT entry. This allows us to support Android devices that have GPT entry at a non-standard location and can't be repartitioned easily. Reviewed-by: Christoph Hellwig Suggested-by: Christoph Hellwig Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-2-digetx@gmail.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 22b5b8502d2a..c9cb12483e12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1872,6 +1872,13 @@ struct block_device_operations { char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; + + /* + * Special callback for probing GPT entry at a given sector. + * Needed by Android devices, used by GPT scanner and MMC blk + * driver. + */ + int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT From 466d9c4904deb25e2e8dcd29d3a998f4e3fa7c17 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:34 +0300 Subject: [PATCH 278/315] partitions/efi: Support non-standard GPT location Support looking up GPT at a non-standard location specified by a block device driver. Acked-by: Davidlohr Bueso Reviewed-by: Christoph Hellwig Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-3-digetx@gmail.com Signed-off-by: Jens Axboe --- block/partitions/efi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index aaa3dc487cb5..7ca5c4c374d4 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -585,6 +585,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; sector_t total_sectors = get_capacity(state->disk); u64 lastlba; @@ -619,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; From dc913385dd74e625271482c30aefedd1e5af7b8c Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:35 +0300 Subject: [PATCH 279/315] mmc: block: Support alternative_gpt_sector() operation Support generic alternative_gpt_sector() block device operation. It calculates location of GPT entry for eMMC of NVIDIA Tegra Android devices. Add new MMC_CAP2_ALT_GPT_TEGRA flag that enables scanning of alternative GPT sector and add raw_boot_mult field to mmc_ext_csd which allows to get size of the boot partitions that is needed for the calculation. Signed-off-by: Dmitry Osipenko Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-4-digetx@gmail.com Signed-off-by: Jens Axboe --- drivers/mmc/core/block.c | 21 +++++++++++++++++++++ drivers/mmc/core/core.c | 35 +++++++++++++++++++++++++++++++++++ drivers/mmc/core/core.h | 2 ++ drivers/mmc/core/mmc.c | 2 ++ include/linux/mmc/card.h | 1 + include/linux/mmc/host.h | 1 + 6 files changed, 62 insertions(+) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 4c11f171e56d..6a15fdf6e5f2 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -831,6 +831,26 @@ static int mmc_blk_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif +static int mmc_blk_alternative_gpt_sector(struct gendisk *disk, + sector_t *sector) +{ + struct mmc_blk_data *md; + int ret; + + md = mmc_blk_get(disk); + if (!md) + return -EINVAL; + + if (md->queue.card) + ret = mmc_card_alternative_gpt_sector(md->queue.card, sector); + else + ret = -ENODEV; + + mmc_blk_put(md); + + return ret; +} + static const struct block_device_operations mmc_bdops = { .open = mmc_blk_open, .release = mmc_blk_release, @@ -840,6 +860,7 @@ static const struct block_device_operations mmc_bdops = { #ifdef CONFIG_COMPAT .compat_ioctl = mmc_blk_compat_ioctl, #endif + .alternative_gpt_sector = mmc_blk_alternative_gpt_sector, }; static int mmc_blk_part_switch_pre(struct mmc_card *card, diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 95fedcf56e4a..605f5e8648c1 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2149,6 +2149,41 @@ int mmc_detect_card_removed(struct mmc_host *host) } EXPORT_SYMBOL(mmc_detect_card_removed); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *gpt_sector) +{ + unsigned int boot_sectors_num; + + if ((!(card->host->caps2 & MMC_CAP2_ALT_GPT_TEGRA))) + return -EOPNOTSUPP; + + /* filter out unrelated cards */ + if (card->ext_csd.rev < 3 || + !mmc_card_mmc(card) || + !mmc_card_is_blockaddr(card) || + mmc_card_is_removable(card->host)) + return -ENOENT; + + /* + * eMMC storage has two special boot partitions in addition to the + * main one. NVIDIA's bootloader linearizes eMMC boot0->boot1->main + * accesses, this means that the partition table addresses are shifted + * by the size of boot partitions. In accordance with the eMMC + * specification, the boot partition size is calculated as follows: + * + * boot partition size = 128K byte x BOOT_SIZE_MULT + * + * Calculate number of sectors occupied by the both boot partitions. + */ + boot_sectors_num = card->ext_csd.raw_boot_mult * SZ_128K / + SZ_512 * MMC_NUM_BOOT_PARTITION; + + /* Defined by NVIDIA and used by Android devices. */ + *gpt_sector = card->ext_csd.sectors - boot_sectors_num - 1; + + return 0; +} +EXPORT_SYMBOL(mmc_card_alternative_gpt_sector); + void mmc_rescan(struct work_struct *work) { struct mmc_host *host = diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 0c4de2030b3f..7931a4f0137d 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -119,6 +119,8 @@ void mmc_release_host(struct mmc_host *host); void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx); void mmc_put_card(struct mmc_card *card, struct mmc_ctx *ctx); +int mmc_card_alternative_gpt_sector(struct mmc_card *card, sector_t *sector); + /** * mmc_claim_host - exclusively claim a host * @host: mmc host to claim diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 838726b68ff3..29e58ffae379 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -418,6 +418,8 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) ext_csd[EXT_CSD_ERASE_TIMEOUT_MULT]; card->ext_csd.raw_hc_erase_grp_size = ext_csd[EXT_CSD_HC_ERASE_GRP_SIZE]; + card->ext_csd.raw_boot_mult = + ext_csd[EXT_CSD_BOOT_MULT]; if (card->ext_csd.rev >= 3) { u8 sa_shift = ext_csd[EXT_CSD_S_A_TIMEOUT]; card->ext_csd.part_config = ext_csd[EXT_CSD_PART_CONFIG]; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 74e6c0624d27..37f975875102 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -109,6 +109,7 @@ struct mmc_ext_csd { u8 raw_hc_erase_gap_size; /* 221 */ u8 raw_erase_timeout_mult; /* 223 */ u8 raw_hc_erase_grp_size; /* 224 */ + u8 raw_boot_mult; /* 226 */ u8 raw_sec_trim_mult; /* 229 */ u8 raw_sec_erase_mult; /* 230 */ u8 raw_sec_feature_support;/* 231 */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 0abd47e9ef9b..78dadf86b38f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -398,6 +398,7 @@ struct mmc_host { #else #define MMC_CAP2_CRYPTO 0 #endif +#define MMC_CAP2_ALT_GPT_TEGRA (1 << 28) /* Host with eMMC that has GPT entry at a non-standard location */ int fixed_drv_type; /* fixed driver type for non-removable media */ From 1743fa54c9e8247000e060fcdab406ab3a808223 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Fri, 20 Aug 2021 03:45:36 +0300 Subject: [PATCH 280/315] mmc: sdhci-tegra: Enable MMC_CAP2_ALT_GPT_TEGRA Tegra20/30/114/124 Android devices place GPT at a non-standard location. Enable GPT entry scanning at that location. Signed-off-by: Dmitry Osipenko Acked-by: Thierry Reding Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210820004536.15791-5-digetx@gmail.com Signed-off-by: Jens Axboe --- drivers/mmc/host/sdhci-tegra.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 387ce9cdbd7c..a5001875876b 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -116,6 +116,8 @@ */ #define NVQUIRK_HAS_TMCLK BIT(10) +#define NVQUIRK_HAS_ANDROID_GPT_SECTOR BIT(11) + /* SDMMC CQE Base Address for Tegra Host Ver 4.1 and Higher */ #define SDHCI_TEGRA_CQE_BASE_ADDR 0xF000 @@ -1361,6 +1363,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra20 = { .pdata = &sdhci_tegra20_pdata, .dma_mask = DMA_BIT_MASK(32), .nvquirks = NVQUIRK_FORCE_SDHCI_SPEC_200 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_ENABLE_BLOCK_GAP_DET, }; @@ -1390,6 +1393,7 @@ static const struct sdhci_tegra_soc_data soc_data_tegra30 = { .nvquirks = NVQUIRK_ENABLE_SDHCI_SPEC_300 | NVQUIRK_ENABLE_SDR50 | NVQUIRK_ENABLE_SDR104 | + NVQUIRK_HAS_ANDROID_GPT_SECTOR | NVQUIRK_HAS_PADCALIB, }; @@ -1422,6 +1426,7 @@ static const struct sdhci_pltfm_data sdhci_tegra114_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra114 = { .pdata = &sdhci_tegra114_pdata, .dma_mask = DMA_BIT_MASK(32), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { @@ -1438,6 +1443,7 @@ static const struct sdhci_pltfm_data sdhci_tegra124_pdata = { static const struct sdhci_tegra_soc_data soc_data_tegra124 = { .pdata = &sdhci_tegra124_pdata, .dma_mask = DMA_BIT_MASK(34), + .nvquirks = NVQUIRK_HAS_ANDROID_GPT_SECTOR, }; static const struct sdhci_ops tegra210_sdhci_ops = { @@ -1616,6 +1622,9 @@ static int sdhci_tegra_probe(struct platform_device *pdev) tegra_host->pad_control_available = false; tegra_host->soc_data = soc_data; + if (soc_data->nvquirks & NVQUIRK_HAS_ANDROID_GPT_SECTOR) + host->mmc->caps2 |= MMC_CAP2_ALT_GPT_TEGRA; + if (soc_data->nvquirks & NVQUIRK_NEEDS_PAD_CONTROL) { rc = tegra_sdhci_init_pinctrl_info(&pdev->dev, tegra_host); if (rc == 0) From 9f2869921f2a102e209297d4f742f34b46ed3d36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 16:43:10 +0200 Subject: [PATCH 281/315] block: refine the disk_live check in del_gendisk hidden gendisks will never be marked live. Fixes: 40b3a52ffc5b ("block: add a sanity check for a live disk in del_gendisk") Reported-by: Bruno Goncalves Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824144310.1487816-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 6a5b65c86c4b..567549a011d1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -556,7 +556,7 @@ void del_gendisk(struct gendisk *disk) { might_sleep(); - if (WARN_ON_ONCE(!disk_live(disk))) + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; blk_integrity_del(disk); From 158ee7b65653d9f841823c249014c2d0dfdeeb8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2021 17:18:23 +0200 Subject: [PATCH 282/315] block: mark blkdev_fsync static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit blkdev_fsync is only used inside of block_dev.c since the removal of the raw drіver. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210824151823.1575100-1-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- include/linux/fs.h | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index d3a8062302a0..1f21ac984253 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -687,7 +687,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -708,7 +709,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..9220cdf648b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3246,10 +3246,6 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); -/* fs/block_dev.c */ -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync); - /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); From 62283c6c9d4c1018badcd0b9c5b6ca66d978fa0d Mon Sep 17 00:00:00 2001 From: Jing Yangyang Date: Mon, 23 Aug 2021 23:07:02 -0700 Subject: [PATCH 283/315] include:libata: fix boolreturn.cocci warnings ./include/linux/libata.h:1462:8-9:WARNING: return of 0/1 in function 'ata_is_host_link' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Reported-by: Zeal Robot Signed-off-by: Jing Yangyang Link: https://lore.kernel.org/r/20210824060702.59006-1-deng.changcheng@zte.com.cn Signed-off-by: Jens Axboe --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/libata.h b/include/linux/libata.h index a2d1bae7900b..860e63f5667b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1459,7 +1459,7 @@ static inline bool sata_pmp_attached(struct ata_port *ap) static inline bool ata_is_host_link(const struct ata_link *link) { - return 1; + return true; } #endif /* CONFIG_SATA_PMP */ From ead3b768bb51259e3a5f2287ff5fc9041eb6f450 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:18 +0000 Subject: [PATCH 284/315] blk-zoned: allow zone management send operations without CAP_SYS_ADMIN Zone management send operations (BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE) should be allowed under the same permissions as write(). (write() does not require CAP_SYS_ADMIN). Additionally, other ioctls like BLKSECDISCARD and BLKZEROOUT only check if the fd was successfully opened with FMODE_WRITE. (They do not require CAP_SYS_ADMIN). Currently, zone management send operations require both CAP_SYS_ADMIN and that the fd was successfully opened with FMODE_WRITE. Remove the CAP_SYS_ADMIN requirement, so that zone management send operations match the access control requirement of write(), BLKSECDISCARD and BLKZEROOUT. Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-2-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 86fce751bb17..8a60dbeb44be 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -421,9 +421,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!(mode & FMODE_WRITE)) return -EBADF; From 4d643b66089591b4769bcdb6fd1bfeff2fe301b8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 11 Aug 2021 11:05:19 +0000 Subject: [PATCH 285/315] blk-zoned: allow BLKREPORTZONE without CAP_SYS_ADMIN A user space process should not need the CAP_SYS_ADMIN capability set in order to perform a BLKREPORTZONE ioctl. Getting the zone report is required in order to get the write pointer. Neither read() nor write() requires CAP_SYS_ADMIN, so it is reasonable that a user space process that can read/write from/to the device, also can get the write pointer. (Since e.g. writes have to be at the write pointer.) Fixes: 3ed05a987e0f ("blk-zoned: implement ioctls") Signed-off-by: Niklas Cassel Reviewed-by: Damien Le Moal Reviewed-by: Aravind Ramesh Reviewed-by: Adam Manzanares Reviewed-by: Himanshu Madhani Reviewed-by: Johannes Thumshirn Cc: stable@vger.kernel.org # v4.10+ Link: https://lore.kernel.org/r/20210811110505.29649-3-Niklas.Cassel@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 8a60dbeb44be..1d0c76c18fc5 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -360,9 +360,6 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!blk_queue_is_zoned(q)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; From d32f89da7fa8ccc8b3fb8f909d61e42b9bc39329 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:44 +0100 Subject: [PATCH 286/315] net: add accept helper not installing fd Introduce and reuse a helper that acts similarly to __sys_accept4_file() but returns struct file instead of installing file descriptor. Will be used by io_uring. Signed-off-by: Pavel Begunkov Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe Acked-by: David S. Miller Link: https://lore.kernel.org/r/c57b9e8e818d93683a3d24f8ca50ca038d1da8c4.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/socket.h | 3 ++ net/socket.c | 71 ++++++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 34 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 0d8e3dcb7f88..d3c1a42a2edd 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -421,6 +421,9 @@ extern int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags, unsigned long nofile); +extern struct file *do_accept(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); diff --git a/net/socket.c b/net/socket.c index 0b2dad3bdf7f..532fff5a3684 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1722,32 +1722,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -int __sys_accept4_file(struct file *file, unsigned file_flags, +struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags, - unsigned long nofile) + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd; + int err, len; struct sockaddr_storage address; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file); - if (!sock) { - err = -ENOTSOCK; - goto out; - } + if (!sock) + return ERR_PTR(-ENOTSOCK); - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = sock->type; newsock->ops = sock->ops; @@ -1758,18 +1748,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = __get_unused_fd_flags(flags, nofile); - if (unlikely(newfd < 0)) { - err = newfd; - sock_release(newsock); - goto out; - } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - put_unused_fd(newfd); - goto out; - } + if (IS_ERR(newfile)) + return newfile; err = security_socket_accept(sock, newsock); if (err) @@ -1794,16 +1775,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, } /* File flags are not inherited via accept() unlike another OSes. */ - - fd_install(newfd, newfile); - err = newfd; -out: - return err; + return newfile; out_fd: fput(newfile); - put_unused_fd(newfd); - goto out; + return ERR_PTR(err); +} +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags, + unsigned long nofile) +{ + struct file *newfile; + int newfd; + + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + newfd = __get_unused_fd_flags(flags, nofile); + if (unlikely(newfd < 0)) + return newfd; + + newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen, + flags); + if (IS_ERR(newfile)) { + put_unused_fd(newfd); + return PTR_ERR(newfile); + } + fd_install(newfd, newfile); + return newfd; } /* From b9445598d8c60a1379887b957024b71343965f74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:45 +0100 Subject: [PATCH 287/315] io_uring: openat directly into fixed fd table Instead of opening a file into a process's file table as usual and then registering the fd within io_uring, some users may want to skip the first step and place it directly into io_uring's fixed file table. This patch adds such a capability for IORING_OP_OPENAT and IORING_OP_OPENAT2. The behaviour is controlled by setting sqe->file_index, where 0 implies the old behaviour using normal file tables. If non-zero value is specified, then it will behave as described and place the file into a fixed file slot sqe->file_index - 1. A file table should be already created, the slot should be valid and empty, otherwise the operation will fail. Keep the error codes consistent with IORING_OP_FILES_UPDATE, ENXIO and EINVAL on inappropriate fixed tables, and return EBADF on collision with already registered file. Note: IOSQE_FIXED_FILE can't be used to switch between modes, because accept takes a file, and it already uses the flag with a different meaning. Suggested-by: Josh Triplett Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/e9b33d1163286f51ea707f87d95bd596dada1e65.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 74 +++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 5 ++- 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6859438c4e09..62da427d614d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -580,6 +580,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -1063,6 +1064,9 @@ static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); + static struct kmem_cache *req_cachep; static const struct file_operations io_uring_fops; @@ -3864,7 +3868,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)) + if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3881,6 +3885,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3918,8 +3927,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3938,9 +3947,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ -3949,7 +3960,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3962,7 +3974,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -7899,6 +7916,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -10366,6 +10423,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 79126d5cd289..45a4f2373694 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -55,7 +55,10 @@ struct io_uring_sqe { } __attribute__((packed)); /* personality to use, if used */ __u16 personality; - __s32 splice_fd_in; + union { + __s32 splice_fd_in; + __u32 file_index; + }; __u64 __pad2[2]; }; From a7083ad5e30767ede4ff49d7471ea9c078702db2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:46 +0100 Subject: [PATCH 288/315] io_uring: hand code io_accept() fd installing Make io_accept() to handle file descriptor allocations and installation. A preparation patch for bypassing file tables. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/5b73d204caa0ce979ccb98136695b60f52a3d98c.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 62da427d614d..ad0a61591ac7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4810,6 +4810,11 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return 0; } @@ -4818,20 +4823,28 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; - int ret; + struct file *file; + int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags, - accept->nofile); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret < 0) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); + } else { + fd_install(fd, file); + ret = fd; } __io_req_complete(req, issue_flags, ret, 0); return 0; From aaa4db12ef7bdc3e343580d1d3c0b2a8874fc1fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 12:25:47 +0100 Subject: [PATCH 289/315] io_uring: accept directly into fixed file table As done with open opcodes, allow accept to skip installing fd into processes' file tables and put it directly into io_uring's fixed file table. Same restrictions and design as for open. Suggested-by: Josh Triplett Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/6d16163f376fac7ac26a656de6b42199143e9721.1629888991.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ad0a61591ac7..e321cb0d00c4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -515,6 +515,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + u32 file_slot; unsigned long nofile; }; @@ -4803,7 +4804,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->splice_fd_in) + if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4811,6 +4812,10 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || + (accept->flags & SOCK_CLOEXEC))) + return -EINVAL; if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) @@ -4823,28 +4828,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; + bool fixed = !!accept->file_slot; struct file *file; int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else { + } else if (!fixed) { fd_install(fd, file); ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + accept->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; From cc40b7225151f611ef837f6403cfaeadc7af214a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 24 Aug 2021 22:59:18 -0700 Subject: [PATCH 290/315] blk-crypto: fix check for too-large dun_bytes dun_bytes needs to be less than or equal to the IV size of the encryption mode, not just less than or equal to BLK_CRYPTO_MAX_IV_SIZE. Currently this doesn't matter since blk_crypto_init_key() is never actually passed invalid values, but we might as well fix this. Fixes: a892c8d52c02 ("block: Inline encryption support for blk-mq") Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20210825055918.51975-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-crypto.c b/block/blk-crypto.c index c5bdaafffa29..103c2e2d50d6 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -332,7 +332,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, if (mode->keysize == 0) return -EINVAL; - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) From 1e294970fc00f45c1f17fb442c26a7e3fc9789b1 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Wed, 25 Aug 2021 14:19:51 +0800 Subject: [PATCH 291/315] block, bfq: cleanup the repeated declaration Function 'bfq_entity_to_bfqq' is declared twice, so remove the repeated declaration and blank line. Cc: Paolo Valente Cc: Jens Axboe Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/1629872391-46399-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 385e28a843d1..a73488eec8a4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -955,8 +955,6 @@ struct bfq_group { }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ From 1d1cf156dc176e30eeaced5cf1450d582d387b81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 09:54:38 +0200 Subject: [PATCH 292/315] sg: pass the device name to blk_trace_setup Fix a regression that passed a NULL device name to blk_trace_setup accidentally. Fixes: aebbb5831fbd ("sg: do not allocate a gendisk") Reported-by: syzbot+f74aa89114a236643919@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210825075438.1883687-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 477267add49d..d5889b4f0fd4 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1118,7 +1118,7 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, return put_user(max_sectors_bytes(sdp->device->request_queue), ip); case BLKTRACESETUP: - return blk_trace_setup(sdp->device->request_queue, NULL, + return blk_trace_setup(sdp->device->request_queue, sdp->name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), NULL, p); case BLKTRACESTART: From 0c6e1d7fd5e7560fdc4bb3418c2c0f0d7a95bf76 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 26 Aug 2021 01:58:56 +0800 Subject: [PATCH 293/315] io_uring: don't free request to slab It's not necessary to free the request back to slab when we fail to get sqe, just move it to state->free_list. Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210825175856.194299-1-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e321cb0d00c4..be3c3aea6398 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6889,7 +6889,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - kmem_cache_free(req_cachep, req); + list_add(&req->inflight_entry, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ From 93f63bc41f699318807df202a175d564c26bda87 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:03 +0200 Subject: [PATCH 294/315] nbd: add missing locking to the nbd_dev_add error path idr_remove needs external synchronization. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-2-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 379032a64a7c..0c1389da3066 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1770,7 +1770,9 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) return nbd; out_free_idr: + mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); + mutex_unlock(&nbd_index_mutex); out_free_tags: blk_mq_free_tag_set(&nbd->tag_set); out_free_nbd: From 409e0ff10ead30a620ee48acb6d4545d9cb95359 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:04 +0200 Subject: [PATCH 295/315] nbd: reset NBD to NULL when restarting in nbd_genl_connect When nbd_genl_connect restarts to wait for a disconnecting device, nbd needs to be reset to NULL. Do that by facoring out a helper to find an unused device. Fixes: 6177b56c96ff ("nbd: refactor device search and allocation in nbd_genl_connect") Reported-by: Tetsuo Handa Reported-by: Hillf Danton Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-3-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0c1389da3066..938ca7f5a11f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1781,6 +1781,20 @@ out: return ERR_PTR(err); } +static struct nbd_device *nbd_find_unused(void) +{ + struct nbd_device *nbd; + int id; + + lockdep_assert_held(&nbd_index_mutex); + + idr_for_each_entry(&nbd_index_idr, nbd, id) + if (!refcount_read(&nbd->config_refs)) + return nbd; + + return NULL; +} + /* Netlink interface. */ static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { [NBD_ATTR_INDEX] = { .type = NLA_U32 }, @@ -1828,7 +1842,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { DECLARE_COMPLETION_ONSTACK(destroy_complete); - struct nbd_device *nbd = NULL; + struct nbd_device *nbd; struct nbd_config *config; int index = -1; int ret; @@ -1849,20 +1863,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) } again: mutex_lock(&nbd_index_mutex); - if (index == -1) { - struct nbd_device *tmp; - int id; - - idr_for_each_entry(&nbd_index_idr, tmp, id) { - if (!refcount_read(&tmp->config_refs)) { - nbd = tmp; - break; - } - } - } else { + if (index == -1) + nbd = nbd_find_unused(); + else nbd = idr_find(&nbd_index_idr, index); - } - if (nbd) { if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { From 75b7f62aa65d5c496391ec2c3db3561aaf81a403 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:05 +0200 Subject: [PATCH 296/315] nbd: prevent IDR lookups from finding partially initialized devices Previously nbd_index_mutex was held during whole add/remove/lookup operations in order to guarantee that partially initialized devices are not reachable via idr_find() or idr_for_each(). But now that partially initialized devices become reachable as soon as idr_alloc() succeeds, we need to skip partially initialized devices. Since it seems that all functions use refcount_inc_not_zero(&nbd->refs) in order to skip destroying devices, update nbd->refs from zero to non-zero as the last step of device initialization in order to also skip partially initialized devices. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch, added comments] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-4-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 938ca7f5a11f..b1ed2360ef32 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1747,7 +1747,11 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); - refcount_set(&nbd->refs, refs); + /* + * Start out with a zero references to keep other threads from using + * this device until it is fully initialized. + */ + refcount_set(&nbd->refs, 0); INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; @@ -1766,6 +1770,11 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); add_disk(disk); + + /* + * Now publish the device. + */ + refcount_set(&nbd->refs, refs); nbd_total_devices++; return nbd; From b190300decb352a0b865d7aa379e89b17d772a43 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 25 Aug 2021 18:31:06 +0200 Subject: [PATCH 297/315] nbd: set nbd->index before releasing nbd_index_mutex Set nbd->index before releasing nbd_index_mutex, as populate_nbd_status() might access nbd->index as soon as nbd_index_mutex is released. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Signed-off-by: Tetsuo Handa [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-5-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b1ed2360ef32..6a832bf81647 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1720,10 +1720,10 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err >= 0) index = err; } + nbd->index = index; mutex_unlock(&nbd_index_mutex); if (err < 0) goto out_free_tags; - nbd->index = index; disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); if (IS_ERR(disk)) { From 438cd318c8dfa5228ffd43af1b98d7cd7d92e1c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:07 +0200 Subject: [PATCH 298/315] nbd: only return usable devices from nbd_find_unused Device marked as NBD_DESTROY_ON_DISCONNECT can and should be skipped given that they won't survive the disconnect. So skip them and try to grab a reference directly and just continue if the the devices is being torn down or created and thus has a zero refcount. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-6-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6a832bf81647..70dc9d80a173 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1790,16 +1790,20 @@ out: return ERR_PTR(err); } -static struct nbd_device *nbd_find_unused(void) +static struct nbd_device *nbd_find_get_unused(void) { struct nbd_device *nbd; int id; lockdep_assert_held(&nbd_index_mutex); - idr_for_each_entry(&nbd_index_idr, nbd, id) - if (!refcount_read(&nbd->config_refs)) + idr_for_each_entry(&nbd_index_idr, nbd, id) { + if (refcount_read(&nbd->config_refs) || + test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags)) + continue; + if (refcount_inc_not_zero(&nbd->refs)) return nbd; + } return NULL; } @@ -1873,10 +1877,10 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) again: mutex_lock(&nbd_index_mutex); if (index == -1) - nbd = nbd_find_unused(); + nbd = nbd_find_get_unused(); else nbd = idr_find(&nbd_index_idr, index); - if (nbd) { + if (nbd && index != -1) { if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { nbd->destroy_complete = &destroy_complete; @@ -1889,8 +1893,6 @@ again: if (!refcount_inc_not_zero(&nbd->refs)) { mutex_unlock(&nbd_index_mutex); - if (index == -1) - goto again; pr_err("nbd: device at index %d is going down\n", index); return -EINVAL; From 7ee656c3ac3d047b4cf1269f83ac9d6c0bba916b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Aug 2021 18:31:08 +0200 Subject: [PATCH 299/315] nbd: remove nbd->destroy_complete The nbd->destroy_complete pointer is not really needed. For creating a device without a specific index we now simplify skip devices marked NBD_DESTROY_ON_DISCONNECT as there is not much point to reuse them. For device creation with a specific index there is no real need to treat the case of a requested but not finished disconnect different than any other device that is being shutdown, i.e. we can just return an error, as a slightly different race window would anyway. Fixes: 6e4df4c64881 ("nbd: reduce the nbd_index_mutex scope") Reported-by: Tetsuo Handa Reported-by: syzbot+2c98885bcd769f56b6d6@syzkaller.appspotmail.com Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210825163108.50713-7-hch@lst.de Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 54 +++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 70dc9d80a173..44143068c91e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -120,7 +120,6 @@ struct nbd_device { struct task_struct *task_recv; struct task_struct *task_setup; - struct completion *destroy_complete; unsigned long flags; char *backend; @@ -235,19 +234,6 @@ static const struct device_attribute backend_attr = { .show = backend_show, }; -/* - * Place this in the last just before the nbd is freed to - * make sure that the disk and the related kobject are also - * totally removed to avoid duplicate creation of the same - * one. - */ -static void nbd_notify_destroy_completion(struct nbd_device *nbd) -{ - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - nbd->destroy_complete) - complete(nbd->destroy_complete); -} - static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; @@ -262,7 +248,6 @@ static void nbd_dev_remove(struct nbd_device *nbd) */ mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, nbd->index); - nbd_notify_destroy_completion(nbd); mutex_unlock(&nbd_index_mutex); kfree(nbd); @@ -1702,7 +1687,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); - nbd->destroy_complete = NULL; nbd->backend = NULL; err = blk_mq_alloc_tag_set(&nbd->tag_set); @@ -1854,7 +1838,6 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { - DECLARE_COMPLETION_ONSTACK(destroy_complete); struct nbd_device *nbd; struct nbd_config *config; int index = -1; @@ -1876,31 +1859,24 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) } again: mutex_lock(&nbd_index_mutex); - if (index == -1) + if (index == -1) { nbd = nbd_find_get_unused(); - else - nbd = idr_find(&nbd_index_idr, index); - if (nbd && index != -1) { - if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && - test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { - nbd->destroy_complete = &destroy_complete; - mutex_unlock(&nbd_index_mutex); - - /* wait until the nbd device is completely destroyed */ - wait_for_completion(&destroy_complete); - goto again; - } - - if (!refcount_inc_not_zero(&nbd->refs)) { - mutex_unlock(&nbd_index_mutex); - pr_err("nbd: device at index %d is going down\n", - index); - return -EINVAL; - } - mutex_unlock(&nbd_index_mutex); } else { - mutex_unlock(&nbd_index_mutex); + nbd = idr_find(&nbd_index_idr, index); + if (nbd) { + if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && + test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) || + !refcount_inc_not_zero(&nbd->refs)) { + mutex_unlock(&nbd_index_mutex); + pr_err("nbd: device at index %d is going down\n", + index); + return -EINVAL; + } + } + } + mutex_unlock(&nbd_index_mutex); + if (!nbd) { nbd = nbd_dev_add(index, 2); if (IS_ERR(nbd)) { pr_err("nbd: failed to add new device\n"); From 46d4703b1db4c86ab5acb2331b10df999f005e8e Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 18 Aug 2021 13:57:48 +0800 Subject: [PATCH 300/315] md/raid10: Remove unnecessary rcu_dereference in raid10_handle_discard We are seeing the following warning in raid10_handle_discard. [ 695.110751] ============================= [ 695.131439] WARNING: suspicious RCU usage [ 695.151389] 4.18.0-319.el8.x86_64+debug #1 Not tainted [ 695.174413] ----------------------------- [ 695.192603] drivers/md/raid10.c:1776 suspicious rcu_dereference_check() usage! [ 695.225107] other info that might help us debug this: [ 695.260940] rcu_scheduler_active = 2, debug_locks = 1 [ 695.290157] no locks held by mkfs.xfs/10186. In the first loop of function raid10_handle_discard. It already determines which disk need to handle discard request and add the rdev reference count rdev->nr_pending. So the conf->mirrors will not change until all bios come back from underlayer disks. It doesn't need to use rcu_dereference to get rdev. Cc: stable@vger.kernel.org Fixes: d30588b2731f ('md/raid10: improve raid10 discard request') Signed-off-by: Xiao Ni Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 16977e8e075d..d5d92337e35e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1712,6 +1712,11 @@ retry_discard: } else r10_bio->master_bio = (struct bio *)first_r10bio; + /* + * first select target devices under rcu_lock and + * inc refcount on their rdev. Record them by setting + * bios[x] to bio + */ rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1743,9 +1748,6 @@ retry_discard: for (disk = 0; disk < geo->raid_disks; disk++) { sector_t dev_start, dev_end; struct bio *mbio, *rbio = NULL; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); /* * Now start to calculate the start and end address for each disk. @@ -1775,9 +1777,12 @@ retry_discard: /* * It only handles discard bio which size is >= stripe size, so - * dev_end > dev_start all the time + * dev_end > dev_start all the time. + * It doesn't need to use rcu lock to get rdev here. We already + * add rdev->nr_pending in the first loop. */ if (r10_bio->devs[disk].bio) { + struct md_rdev *rdev = conf->mirrors[disk].rdev; mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); mbio->bi_end_io = raid10_end_discard_request; mbio->bi_private = r10_bio; @@ -1790,6 +1795,7 @@ retry_discard: bio_endio(mbio); } if (r10_bio->devs[disk].repl_bio) { + struct md_rdev *rrdev = conf->mirrors[disk].replacement; rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); rbio->bi_end_io = raid10_end_discard_request; rbio->bi_private = r10_bio; From 14afdd6ee3a0db7bcae887d1951ed21c4d1539cd Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 27 Aug 2021 17:46:08 +0800 Subject: [PATCH 301/315] io_uring: remove redundant req_set_fail() req_set_fail() in io_submit_sqe() is redundant, remove it. Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210827094609.36052-2-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index be3c3aea6398..48cc6328eb06 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6734,7 +6734,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, fail_req: if (link->head) { /* fail even hard links since we don't submit */ - req_set_fail(link->head); io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } From a8295b982c46d4a7c259a4cdd58a2681929068a9 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 27 Aug 2021 17:46:09 +0800 Subject: [PATCH 302/315] io_uring: fix failed linkchain code logic Given a linkchain like this: req0(link_flag)-->req1(link_flag)-->...-->reqn(no link_flag) There is a problem: - if some intermediate linked req like req1 's submittion fails, reqs after it won't be cancelled. - sqpoll disabled: maybe it's ok since users can get the error info of req1 and stop submitting the following sqes. - sqpoll enabled: definitely a problem, the following sqes will be submitted in the next round. The solution is to refactor the code logic to: - if a linked req's submittion fails, just mark it and the head(if it exists) as REQ_F_FAIL. Leverage req->result to indicate whether it is failed or cancelled. - submit or fail the whole chain when we come to the end of it. Signed-off-by: Hao Xu Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210827094609.36052-3-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 61 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 48cc6328eb06..1341b714ed12 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1188,6 +1188,12 @@ static inline void req_set_fail(struct io_kiocb *req) req->flags |= REQ_F_FAIL; } +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + req->result = res; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1957,11 +1963,16 @@ static void io_fail_links(struct io_kiocb *req) req->link = NULL; while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->result; + nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); + io_cqring_fill_event(link->ctx, link->user_data, res, 0); io_put_req_deferred(link); link = nxt; } @@ -6622,8 +6633,10 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; - if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); + } else if (req->flags & REQ_F_FAIL) { + io_req_complete_failed(req, req->result); } else { int ret = io_req_prep_async(req); @@ -6732,19 +6745,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: + /* fail even hard links since we don't submit */ if (link->head) { - /* fail even hard links since we don't submit */ - io_req_complete_failed(link->head, -ECANCELED); - link->head = NULL; + /* + * we can judge a link req is failed or cancelled by if + * REQ_F_FAIL is set, but the head is an exception since + * it may be set REQ_F_FAIL because of other req's failure + * so let's leverage req->result to distinguish if a head + * is set REQ_F_FAIL because of its failure or other req's + * failure so that we can set the correct ret code for it. + * init result here to avoid affecting the normal path. + */ + if (!(link->head->flags & REQ_F_FAIL)) + req_fail_link_node(link->head, -ECANCELED); + } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + /* + * the current req is a normal req, we should return + * error and thus break the submittion loop. + */ + io_req_complete_failed(req, ret); + return ret; } - io_req_complete_failed(req, ret); - return ret; + req_fail_link_node(req, ret); + } else { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; } - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, req->flags, true, @@ -6760,9 +6788,14 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; + if (!(req->flags & REQ_F_FAIL)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) { + req_fail_link_node(req, ret); + if (!(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + } + } trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; From 9a10867ae54e02a0f204d2eebea5a446fb7a86f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 27 Aug 2021 11:55:01 +0100 Subject: [PATCH 303/315] io_uring: add task-refs-get helper As we have a more complicated task referencing, which apart from normal task references includes taking tctx->inflight and caching all that, it would be a good idea to have all that isolated in helpers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d9114d037f1c195897aa13f38a496078eca2afdb.1630023531.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1341b714ed12..5059049da242 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1670,6 +1670,24 @@ static inline void io_put_task(struct task_struct *task, int nr) } } +static void io_task_refs_refill(struct io_uring_task *tctx) +{ + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, ¤t->usage); + tctx->cached_refs += refill; +} + +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags) { @@ -6890,25 +6908,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { - struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; + io_get_task_refs(nr); - tctx = current->io_uring; - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) { - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, ¤t->usage); - tctx->cached_refs += refill; - } io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; From b18a1a4574d2d15f1b0c84658d4549ccbf241fee Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 20:51:39 +0100 Subject: [PATCH 304/315] io_uring: clarify io_req_task_cancel() locking It's too easy to forget and misjudge about synchronisation in io_req_task_cancel(), add a comment clarifying it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/71099083835f983a1fd73d5a3da6391924da8300.1629920396.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5059049da242..09194f7276ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2168,7 +2168,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx is guaranteed to stay alive while we hold uring_lock */ + /* not needed for normal modes, but SQPOLL depends on it */ io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); } @@ -2177,7 +2177,6 @@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ io_tw_lock(ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ if (likely(!(req->task->flags & PF_EXITING))) From 90499ad00ca59320b5bb43392b7931e1bd84cad2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Aug 2021 20:51:40 +0100 Subject: [PATCH 305/315] io_uring: add build check for buf_index overflows req->buf_index is u16 and so we rely on registered buffers indexes fitting into it. Add a build check, so when the upper limit for the number of buffers is lifted we get a compliation fail but not lurking problems. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/787e8e1a17cea51ca6301426b1c4c4887b8bd676.1629920396.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 09194f7276ba..53326449d685 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10493,6 +10493,10 @@ static int __init io_uring_init(void) sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); + + /* ->buf_index is u16 */ + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); From 6607cd319b6b91bff94e90f798a61c031650b514 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Aug 2021 09:16:54 +0800 Subject: [PATCH 306/315] raid1: ensure write behind bio has less than BIO_MAX_VECS sectors We can't split write behind bio with more than BIO_MAX_VECS sectors, otherwise the below call trace was triggered because we could allocate oversized write behind bio later. [ 8.097936] bvec_alloc+0x90/0xc0 [ 8.098934] bio_alloc_bioset+0x1b3/0x260 [ 8.099959] raid1_make_request+0x9ce/0xc50 [raid1] [ 8.100988] ? __bio_clone_fast+0xa8/0xe0 [ 8.102008] md_handle_request+0x158/0x1d0 [md_mod] [ 8.103050] md_submit_bio+0xcd/0x110 [md_mod] [ 8.104084] submit_bio_noacct+0x139/0x530 [ 8.105127] submit_bio+0x78/0x1d0 [ 8.106163] ext4_io_submit+0x48/0x60 [ext4] [ 8.107242] ext4_writepages+0x652/0x1170 [ext4] [ 8.108300] ? do_writepages+0x41/0x100 [ 8.109338] ? __ext4_mark_inode_dirty+0x240/0x240 [ext4] [ 8.110406] do_writepages+0x41/0x100 [ 8.111450] __filemap_fdatawrite_range+0xc5/0x100 [ 8.112513] file_write_and_wait_range+0x61/0xb0 [ 8.113564] ext4_sync_file+0x73/0x370 [ext4] [ 8.114607] __x64_sys_fsync+0x33/0x60 [ 8.115635] do_syscall_64+0x33/0x40 [ 8.116670] entry_SYSCALL_64_after_hwframe+0x44/0xae Thanks for the comment from Christoph. [1]. https://bugs.archlinux.org/task/70992 Cc: stable@vger.kernel.org # v5.12+ Reported-by: Jens Stutte Tested-by: Jens Stutte Reviewed-by: Christoph Hellwig Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 51f2547c2007..21b348408478 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1331,6 +1331,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; + bool write_behind = false; if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, @@ -1383,6 +1384,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + + /* + * The write-behind io is only attempted on drives marked as + * write-mostly, which means we could allocate write behind + * bio later. + */ + if (rdev && test_bit(WriteMostly, &rdev->flags)) + write_behind = true; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1456,6 +1466,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, goto retry_write; } + /* + * When using a bitmap, we may call alloc_behind_master_bio below. + * alloc_behind_master_bio allocates a copy of the data payload a page + * at a time and thus needs a new bio that can fit the whole payload + * this bio in page sized chunks. + */ + if (write_behind && bitmap) + max_sectors = min_t(int, max_sectors, + BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); From c7e9d0020361f4308a70cdfd6d5335e273eb8717 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Sat, 7 Aug 2021 10:37:02 +0300 Subject: [PATCH 307/315] Revert "floppy: reintroduce O_NDELAY fix" The patch breaks userspace implementations (e.g. fdutils) and introduces regressions in behaviour. Previously, it was possible to O_NDELAY open a floppy device with no media inserted or with write protected media without an error. Some userspace tools use this particular behavior for probing. It's not the first time when we revert this patch. Previous revert is in commit f2791e7eadf4 (Revert "floppy: refactor open() flags handling"). This reverts commit 8a0c014cd20516ade9654fc13b51345ec58e7be8. Link: https://lore.kernel.org/linux-block/de10cb47-34d1-5a88-7751-225ca380f735@compro.net/ Reported-by: Mark Hounschell Cc: Jiri Kosina Cc: Wim Osterholt Cc: Kurt Garloff Cc: Signed-off-by: Denis Efremov --- drivers/block/floppy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 87460e0e5c72..fef79ea52e3e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4029,23 +4029,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (fdc_state[FDC(drive)].rawcmd == 1) fdc_state[FDC(drive)].rawcmd = 2; - if (mode & (FMODE_READ|FMODE_WRITE)) { - drive_state[drive].last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); - if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) + if (!(mode & FMODE_NDELAY)) { + if (mode & (FMODE_READ|FMODE_WRITE)) { + drive_state[drive].last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, + &drive_state[drive].flags); + if (bdev_check_media_change(bdev)) + floppy_revalidate(bdev->bd_disk); + if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) + goto out; + } + res = -EROFS; + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) goto out; } - - res = -EROFS; - - if ((mode & FMODE_WRITE) && - !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) - goto out; - mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); return 0; From d25a025201ed98f4b93775e0999a3f2135702106 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Aug 2021 09:31:28 -0700 Subject: [PATCH 308/315] clocksource: Make clocksource watchdog test safe for slow-HZ systems The clocksource watchdog test sets a local JIFFIES_SHIFT macro and assumes that HZ is >= 100. For smaller HZ values this shift value is too large and causes undefined behaviour. Move the HZ-based definitions of JIFFIES_SHIFT from kernel/time/jiffies.c to kernel/time/tick-internal.h so the clocksource watchdog test can utilize them, which makes it work correctly with all HZ values. [ tglx: Resolved conflicts and massaged changelog ] Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/20210812000133.GA402890@paulmck-ThinkPad-P17-Gen-1/ --- kernel/time/clocksource-wdtest.c | 5 ++--- kernel/time/jiffies.c | 21 +-------------------- kernel/time/tick-internal.h | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 01df12395c0e..df922f49d171 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -19,6 +19,8 @@ #include #include +#include "tick-internal.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -34,9 +36,6 @@ static u64 wdtest_jiffies_read(struct clocksource *cs) return (u64)jiffies; } -/* Assume HZ > 100. */ -#define JIFFIES_SHIFT 8 - static struct clocksource clocksource_wdtest_jiffies = { .name = "wdtest-jiffies", .rating = 1, /* lowest valid rating*/ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 01935aafdb46..bc4db9e5ab70 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -10,28 +10,9 @@ #include #include "timekeeping.h" +#include "tick-internal.h" -/* Since jiffies uses a simple TICK_NSEC multiplier - * conversion, the .shift value could be zero. However - * this would make NTP adjustments impossible as they are - * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to - * shift both the nominator and denominator the same - * amount, and give ntp adjustments in units of 1/2^8 - * - * The value 8 is somewhat carefully chosen, as anything - * larger can result in overflows. TICK_NSEC grows as HZ - * shrinks, so values greater than 8 overflow 32bits when - * HZ=100. - */ -#if HZ < 34 -#define JIFFIES_SHIFT 6 -#elif HZ < 67 -#define JIFFIES_SHIFT 7 -#else -#define JIFFIES_SHIFT 8 -#endif - static u64 jiffies_read(struct clocksource *cs) { return (u64) jiffies; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 3548f0829e6d..649f2b48e8f0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -177,3 +177,23 @@ void clock_was_set(unsigned int bases); void clock_was_set_delayed(void); void hrtimers_resume_local(void); + +/* Since jiffies uses a simple TICK_NSEC multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. TICK_NSEC grows as HZ + * shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif From 2e480058ddc21ec53a10e8b41623e245e908bdbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 11:33:19 -0600 Subject: [PATCH 309/315] io-wq: provide a way to limit max number of workers io-wq divides work into two categories: 1) Work that completes in a bounded time, like reading from a regular file or a block device. This type of work is limited based on the size of the SQ ring. 2) Work that may never complete, we call this unbounded work. The amount of workers here is just limited by RLIMIT_NPROC. For various uses cases, it's handy to have the kernel limit the maximum amount of pending workers for both categories. Provide a way to do with with a new IORING_REGISTER_IOWQ_MAX_WORKERS operation. IORING_REGISTER_IOWQ_MAX_WORKERS takes an array of two integers and sets the max worker count to what is being passed in for each category. The old values are returned into that same array. If 0 is being passed in for either category, it simply returns the current value. The value is capped at RLIMIT_NPROC. This actually isn't that important as it's more of a hint, if we're exceeding the value then our attempt to fork a new worker will fail. This happens naturally already if more than one node is in the system, as these values are per-node internally for io-wq. Reported-by: Johannes Lundberg Link: https://github.com/axboe/liburing/issues/420 Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 +++++++++++++++++++++++++++++ fs/io-wq.h | 1 + fs/io_uring.c | 32 ++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 3 +++ 4 files changed, 65 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 8da9bb103916..4b5fc621ab39 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1152,6 +1152,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. + */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 308af3928424..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 53326449d685..edbda88142f9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10233,6 +10233,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10250,6 +10275,7 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; @@ -10406,6 +10432,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45a4f2373694..64fe809c4e36 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -309,6 +309,9 @@ enum { IORING_REGISTER_IOWQ_AFF = 17, IORING_UNREGISTER_IOWQ_AFF = 18, + /* set/get max number of workers */ + IORING_REGISTER_IOWQ_MAX_WORKERS = 19, + /* this goes last */ IORING_REGISTER_LAST }; From 50c1df2b56e0f581b1dbf334dbf807d6fb8f77b2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Aug 2021 17:11:06 -0600 Subject: [PATCH 310/315] io_uring: support CLOCK_BOOTTIME/REALTIME for timeouts Certain use cases want to use CLOCK_BOOTTIME or CLOCK_REALTIME rather than CLOCK_MONOTONIC, instead of the default CLOCK_MONOTONIC. Add an IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME flag that allows timeouts and linked timeouts to use the selected clock source. Only one clock source may be selected, and we -EINVAL the request if more than one is given. If neither BOOTIME nor REALTIME are selected, the previous default of MONOTONIC is used. Link: https://github.com/axboe/liburing/issues/369 Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 ++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 3 +++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index edbda88142f9..1c99f0143b57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -508,6 +508,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -5712,6 +5713,22 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5725,7 +5742,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5807,7 +5824,10 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; req->timeout.off = off; @@ -5819,12 +5839,13 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 64fe809c4e36..b6d28d927a3f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,9 @@ enum { */ #define IORING_TIMEOUT_ABS (1U << 0) #define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) /* * sqe->splice_flags From ef9dd637084d437463f5e9efa153dfc94e7e5f08 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Aug 2021 19:54:38 -0600 Subject: [PATCH 311/315] io_uring: keep ltimeouts in a list A preparation patch. Keep all queued linked timeout in a list, so they may be found and updated. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1c99f0143b57..aa978292f34b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -375,6 +375,7 @@ struct io_ring_ctx { struct io_submit_state submit_state; struct list_head timeout_list; + struct list_head ltimeout_list; struct list_head cq_overflow_list; struct xarray io_buffers; struct xarray personalities; @@ -1277,6 +1278,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); @@ -1966,6 +1968,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&link->timeout.list); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); io_put_req_deferred(link); @@ -5830,6 +5833,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); req->timeout.off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -6585,6 +6589,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } + list_del(&req->timeout.list); req->timeout.prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -6608,6 +6613,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + list_add_tail(&req->timeout.list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -8900,6 +8906,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) sock_release(ctx->ring_sock); } #endif + WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); From f1042b6ccb887f07301f6b096b3d0cfcf9189323 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 28 Aug 2021 19:54:39 -0600 Subject: [PATCH 312/315] io_uring: allow updating linked timeouts We allow updating normal timeouts, add support for adjusting timings of linked timeouts as well. Reported-by: Victor Stewart Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 44 +++++++++++++++++++++++++++++++---- include/uapi/linux/io_uring.h | 11 +++++---- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index aa978292f34b..7cc458e0b636 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -552,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -1069,6 +1070,7 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -5732,6 +5734,31 @@ static clockid_t io_timeout_get_clock(struct io_timeout_data *data) } } +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) @@ -5763,10 +5790,15 @@ static int io_timeout_remove_prep(struct io_kiocb *req, if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5800,9 +5832,13 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + spin_lock_irq(&ctx->timeout_lock); - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); spin_unlock_irq(&ctx->timeout_lock); } diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6d28d927a3f..3caec9199658 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -149,12 +149,13 @@ enum { /* * sqe->timeout_flags */ -#define IORING_TIMEOUT_ABS (1U << 0) -#define IORING_TIMEOUT_UPDATE (1U << 1) -#define IORING_TIMEOUT_BOOTTIME (1U << 2) -#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) +#define IORING_TIMEOUT_BOOTTIME (1U << 2) +#define IORING_TIMEOUT_REALTIME (1U << 3) +#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) - +#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* * sqe->splice_flags * extends splice(2) flags From ecc53c48c13d995e6fe5559e30ffee48d92784fd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 29 Aug 2021 16:13:03 -0600 Subject: [PATCH 313/315] io-wq: check max_worker limits if a worker transitions bound state For the two places where new workers are created, we diligently check if we are allowed to create a new worker. If we're currently at the limit of how many workers of a given type we can have, then we don't create any new ones. If you have a mixed workload with various types of bound and unbounded work, then it can happen that a worker finishes one type of work and is then transitioned to the other type. For this case, we don't check if we are actually allowed to do so. This can cause io-wq to temporarily exceed the allowed number of workers for a given type. When retrieving work, check that the types match. If they don't, check if we are allowed to transition to the other type. If not, then don't handle the new work. Cc: stable@vger.kernel.org Reported-by: Johannes Lundberg Signed-off-by: Jens Axboe --- fs/io-wq.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 4b5fc621ab39..da3ad45028f9 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -424,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) spin_unlock(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * We can always run the work if the worker is currently the same type as + * the work (eg both are bound, or both are unbound). If they are not the + * same, only allow it if incrementing the worker count would be allowed. + */ +static bool io_worker_can_run_work(struct io_worker *worker, + struct io_wq_work *work) +{ + struct io_wqe_acct *acct; + + if (!(worker->flags & IO_WORKER_F_BOUND) != + !(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + + /* not the same type, check if we'd go over the limit */ + acct = io_work_get_acct(worker->wqe, work); + return acct->nr_workers < acct->max_workers; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_worker *worker, + bool *stalled) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; @@ -436,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) work = container_of(node, struct io_wq_work, list); + if (!io_worker_can_run_work(worker, work)) + break; + /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { wq_list_del(&wqe->work_list, node, prev); @@ -462,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); + *stalled = true; } return NULL; @@ -501,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; + bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -509,10 +535,11 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(wqe); + stalled = false; + work = io_get_next_work(wqe, worker, &stalled); if (work) __io_worker_busy(wqe, worker, work); - else if (!wq_list_empty(&wqe->work_list)) + else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; raw_spin_unlock_irq(&wqe->lock); From a9a4aa9fbfc5b87f315c63d9a317648774a46879 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Aug 2021 06:33:08 -0600 Subject: [PATCH 314/315] io-wq: wqe and worker locks no longer need to be IRQ safe io_uring no longer queues async work off completion handlers that run in hard or soft interrupt context, and that use case was the only reason that io-wq had to use IRQ safe locks for wqe and worker locks. Signed-off-by: Jens Axboe --- fs/io-wq.c | 59 ++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index da3ad45028f9..13aeb48a0964 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -178,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker) complete(&worker->ref_done); wait_for_completion(&worker->ref_done); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -188,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -254,14 +254,14 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (!ret) { bool do_create = false, first = false; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); @@ -289,14 +289,14 @@ static void create_worker_cb(struct callback_head *cb) wqe = worker->wqe; wq = wqe->wq; acct = &wqe->acct[worker->create_index]; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { if (!acct->nr_workers) first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (do_create) { create_io_worker(wq, wqe, worker->create_index, first); } else { @@ -510,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock_irq(&worker->lock); + spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock_irq(&worker->lock); + spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -542,7 +542,7 @@ get_next: else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -574,16 +574,16 @@ get_next: clear_bit(hash, &wq->hash->map); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); } } while (work); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); } while (1); } @@ -604,13 +604,13 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (io_wqe_run_queue(wqe)) { io_worker_handle_work(worker); goto loop; } __io_worker_idle(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -629,7 +629,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); io_worker_handle_work(worker); } @@ -671,9 +671,9 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&worker->wqe->lock); + raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&worker->wqe->lock); + raw_spin_unlock(&worker->wqe->lock); } static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) @@ -699,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bo kfree(worker); fail: atomic_dec(&acct->nr_running); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); acct->nr_workers--; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); return; } @@ -711,7 +711,7 @@ fail: set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; @@ -719,7 +719,7 @@ fail: worker->flags |= IO_WORKER_F_BOUND; if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); wake_up_new_task(tsk); } @@ -795,7 +795,6 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; - unsigned long flags; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -808,10 +807,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } work_flags = work->flags; - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); if ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running)) @@ -840,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; - unsigned long flags; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. */ - spin_lock_irqsave(&worker->lock, flags); + spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock_irqrestore(&worker->lock, flags); + spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -880,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { struct io_wq_work_node *node, *prev; struct io_wq_work *work; - unsigned long flags; retry: - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -898,7 +895,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, From 87df7fb922d18e96992aa5e824aa34b2065fef59 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Aug 2021 07:45:47 -0600 Subject: [PATCH 315/315] io-wq: fix wakeup race when adding new work When new work is added, io_wqe_enqueue() checks if we need to wake or create a new worker. But that check is done outside the lock that otherwise synchronizes us with a worker going to sleep, so we can end up in the following situation: CPU0 CPU1 lock insert work unlock atomic_read(nr_running) != 0 lock atomic_dec(nr_running) no wakeup needed Hold the wqe lock around the "need to wakeup" check. Then we can also get rid of the temporary work_flags variable, as we know the work will remain valid as long as we hold the lock. Cc: stable@vger.kernel.org Reported-by: Andres Freund Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 13aeb48a0964..cd9bd095fb1b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -794,7 +794,7 @@ append: static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - int work_flags; + bool do_wake; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -806,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - work_flags = work->flags; raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; + do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running); raw_spin_unlock(&wqe->lock); - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) + if (do_wake) io_wqe_wake_worker(wqe, acct); }